From 08a6be139096c60b4fd5be8d51dd8dbcba5c33d5 Mon Sep 17 00:00:00 2001 From: datamining108 Date: Mon, 21 Sep 2020 20:41:18 +0900 Subject: [PATCH 1/4] update post-quantization feature with various model supports --- post_quantization_validation.py | 386 ++++ timm/data/quant_loader.py | 252 +++ timm/models/__init__.py | 4 + timm/models/quantization/__init__.py | 10 + timm/models/quantization/efficientnet.py | 1743 +++++++++++++++++ .../quantization/efficientnet_blocks.py | 423 ++++ .../quantization/efficientnet_builder.py | 414 ++++ timm/models/quantization/layers/__init__.py | 31 + .../models/quantization/layers/activations.py | 66 + .../quantization/layers/activations_jit.py | 90 + .../quantization/layers/activations_me.py | 208 ++ .../layers/adaptive_avgmax_pool.py | 106 + .../quantization/layers/anti_aliasing.py | 60 + timm/models/quantization/layers/blur_pool.py | 58 + timm/models/quantization/layers/classifier.py | 41 + .../models/quantization/layers/cond_conv2d.py | 122 ++ timm/models/quantization/layers/config.py | 115 ++ .../models/quantization/layers/conv2d_same.py | 42 + .../models/quantization/layers/conv_bn_act.py | 40 + timm/models/quantization/layers/create_act.py | 36 + .../quantization/layers/create_conv2d.py | 30 + .../quantization/layers/create_norm_act.py | 74 + timm/models/quantization/layers/drop.py | 167 ++ timm/models/quantization/layers/evo_norm.py | 83 + timm/models/quantization/layers/helpers.py | 27 + .../models/quantization/layers/inplace_abn.py | 87 + .../models/quantization/layers/median_pool.py | 49 + .../quantization/layers/mixed_conv2d.py | 51 + timm/models/quantization/layers/norm_act.py | 86 + timm/models/quantization/layers/padding.py | 56 + .../models/quantization/layers/pool2d_same.py | 71 + .../quantization/layers/selective_kernel.py | 120 ++ .../quantization/layers/separable_conv.py | 74 + .../quantization/layers/space_to_depth.py | 53 + timm/models/quantization/layers/split_attn.py | 88 + .../quantization/layers/split_batchnorm.py | 75 + .../quantization/layers/test_time_pool.py | 50 + .../models/quantization/layers/weight_init.py | 60 + timm/models/quantization/mobilenetv3.py | 462 +++++ timm/models/quantization/rexnet.py | 323 +++ 40 files changed, 6333 insertions(+) create mode 100644 post_quantization_validation.py create mode 100644 timm/data/quant_loader.py create mode 100644 timm/models/quantization/__init__.py create mode 100644 timm/models/quantization/efficientnet.py create mode 100644 timm/models/quantization/efficientnet_blocks.py create mode 100644 timm/models/quantization/efficientnet_builder.py create mode 100644 timm/models/quantization/layers/__init__.py create mode 100644 timm/models/quantization/layers/activations.py create mode 100644 timm/models/quantization/layers/activations_jit.py create mode 100644 timm/models/quantization/layers/activations_me.py create mode 100644 timm/models/quantization/layers/adaptive_avgmax_pool.py create mode 100644 timm/models/quantization/layers/anti_aliasing.py create mode 100644 timm/models/quantization/layers/blur_pool.py create mode 100644 timm/models/quantization/layers/classifier.py create mode 100644 timm/models/quantization/layers/cond_conv2d.py create mode 100644 timm/models/quantization/layers/config.py create mode 100644 timm/models/quantization/layers/conv2d_same.py create mode 100644 timm/models/quantization/layers/conv_bn_act.py create mode 100644 timm/models/quantization/layers/create_act.py create mode 100644 timm/models/quantization/layers/create_conv2d.py create mode 100644 
timm/models/quantization/layers/create_norm_act.py create mode 100644 timm/models/quantization/layers/drop.py create mode 100644 timm/models/quantization/layers/evo_norm.py create mode 100644 timm/models/quantization/layers/helpers.py create mode 100644 timm/models/quantization/layers/inplace_abn.py create mode 100644 timm/models/quantization/layers/median_pool.py create mode 100644 timm/models/quantization/layers/mixed_conv2d.py create mode 100644 timm/models/quantization/layers/norm_act.py create mode 100644 timm/models/quantization/layers/padding.py create mode 100644 timm/models/quantization/layers/pool2d_same.py create mode 100644 timm/models/quantization/layers/selective_kernel.py create mode 100644 timm/models/quantization/layers/separable_conv.py create mode 100644 timm/models/quantization/layers/space_to_depth.py create mode 100644 timm/models/quantization/layers/split_attn.py create mode 100644 timm/models/quantization/layers/split_batchnorm.py create mode 100644 timm/models/quantization/layers/test_time_pool.py create mode 100644 timm/models/quantization/layers/weight_init.py create mode 100644 timm/models/quantization/mobilenetv3.py create mode 100644 timm/models/quantization/rexnet.py diff --git a/post_quantization_validation.py b/post_quantization_validation.py new file mode 100644 index 00000000..6ad3ce2f --- /dev/null +++ b/post_quantization_validation.py @@ -0,0 +1,386 @@ +#!/usr/bin/env python +""" ImageNet Validation Script + +This is intended to be a lean and easily modifiable ImageNet validation script for evaluating pretrained +models or training checkpoints against ImageNet or similarly organized image datasets. It prioritizes +canonical PyTorch, standard Python style, and good performance. Repurpose as you see fit. + +Hacked together by Ross Wightman (https://github.com/rwightman) +""" +import argparse +import os +import csv +import glob +import time +import logging +import torch +import torch.nn as nn +import torch.nn.parallel +from collections import OrderedDict +import torch.quantization + +try: + from apex import amp + has_apex = True +except ImportError: + has_apex = False + +from timm.models import create_model, apply_test_time_pool, load_checkpoint, is_model, list_models +from timm.data import Dataset, DatasetTar, resolve_data_config, RealLabelsImagenet +from timm.data.quant_loader import create_loader +from timm.utils import accuracy, AverageMeter, natural_key, setup_default_logging + +torch.backends.cudnn.benchmark = True +_logger = logging.getLogger('validate') + + +parser = argparse.ArgumentParser(description='PyTorch ImageNet Validation') +parser.add_argument('data', metavar='DIR', + help='path to dataset') +parser.add_argument('--calib-data', metavar='DIR', + help='path to calibration dataset') +parser.add_argument('--model', '-m', metavar='MODEL', default='dpn92', + help='model architecture (default: dpn92)') +parser.add_argument('-j', '--workers', default=4, type=int, metavar='N', + help='number of data loading workers (default: 2)') +parser.add_argument('-b', '--batch-size', default=256, type=int, + metavar='N', help='mini-batch size (default: 256)') +parser.add_argument('--calib-iter', default=100, type=int, + metavar='N', help='Train set iterations for calibration before quantization') +parser.add_argument('--img-size', default=None, type=int, + metavar='N', help='Input image dimension, uses model default if empty') +parser.add_argument('--crop-pct', default=None, type=float, + metavar='N', help='Input image center crop pct') 
+parser.add_argument('--mean', type=float, nargs='+', default=None, metavar='MEAN', + help='Override mean pixel value of dataset') +parser.add_argument('--std', type=float, nargs='+', default=None, metavar='STD', + help='Override std deviation of dataset') +parser.add_argument('--interpolation', default='', type=str, metavar='NAME', + help='Image resize interpolation type (overrides model)') +parser.add_argument('--num-classes', type=int, default=1000, + help='Number of classes in dataset') +parser.add_argument('--class-map', default='', type=str, metavar='FILENAME', + help='path to class to idx mapping file (default: "")') +parser.add_argument('--log-freq', default=10, type=int, + metavar='N', help='batch logging frequency (default: 10)') +parser.add_argument('--checkpoint', default='', type=str, metavar='PATH', + help='path to latest checkpoint (default: none)') +parser.add_argument('--pretrained', dest='pretrained', action='store_true', + help='use pre-trained model') +parser.add_argument('--num-gpu', type=int, default=1, + help='Number of GPUs to use') +parser.add_argument('--no-test-pool', dest='no_test_pool', action='store_true', + help='disable test time pool') +parser.add_argument('--no-prefetcher', action='store_true', default=False, + help='disable fast prefetcher') +parser.add_argument('--pin-mem', action='store_true', default=False, + help='Pin CPU memory in DataLoader for more efficient (sometimes) transfer to GPU.') +parser.add_argument('--amp', action='store_true', default=False, + help='Use AMP mixed precision') +parser.add_argument('--tf-preprocessing', action='store_true', default=False, + help='Use Tensorflow preprocessing pipeline (requires CPU TF installed)') +parser.add_argument('--use-ema', dest='use_ema', action='store_true', + help='use ema version of weights if present') +parser.add_argument('--torchscript', dest='torchscript', action='store_true', + help='convert model to torchscript for inference') +parser.add_argument('--legacy-jit', dest='legacy_jit', action='store_true', + help='use legacy jit mode for pytorch 1.5/1.5.1/1.6 to get back fusion performance') +parser.add_argument('--results-file', default='', type=str, metavar='FILENAME', + help='Output csv file for validation results (summary)') +parser.add_argument('--real-labels', default='', type=str, metavar='FILENAME', + help='Real labels JSON file for imagenet evaluation') +parser.add_argument('--valid-labels', default='', type=str, metavar='FILENAME', + help='Valid label indices txt file for validation of partial label space') + + +def set_jit_legacy(): + """ Set JIT executor to legacy w/ support for op fusion + This is hopefully a temporary need in 1.5/1.5.1/1.6 to restore performance due to changes + in the JIT executor. These APIs are not supported and could change. + """ + # + assert hasattr(torch._C, '_jit_set_profiling_executor'), "Old JIT behavior doesn't exist!"
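+ # Turning off the profiling executor and profiling mode below falls back to the legacy
+ # TorchScript executor/fuser; GPU op fusion is then re-enabled explicitly (private torch._C APIs).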
+ torch._C._jit_set_profiling_executor(False) + torch._C._jit_set_profiling_mode(False) + torch._C._jit_override_can_fuse_on_gpu(True) + #torch._C._jit_set_texpr_fuser_enabled(True) + + + +def validate(args): + # might as well try to validate something + args.pretrained = args.pretrained or not args.checkpoint + args.prefetcher = not args.no_prefetcher + if args.legacy_jit: + set_jit_legacy() + + # create model + model = create_model( + args.model, + pretrained=args.pretrained, + num_classes=args.num_classes, + in_chans=3, + scriptable=args.torchscript) + + if args.checkpoint: + load_checkpoint(model, args.checkpoint, args.use_ema) + + param_count = sum([m.numel() for m in model.parameters()]) + _logger.info('Model %s created, param count: %d' % (args.model, param_count)) + + data_config = resolve_data_config(vars(args), model=model) + model, test_time_pool = apply_test_time_pool(model, data_config, args) + + if args.torchscript: + torch.jit.optimized_execution(True) + model = torch.jit.script(model) + + + criterion = nn.CrossEntropyLoss() + + if os.path.splitext(args.data)[1] == '.tar' and os.path.isfile(args.data): + dataset = DatasetTar(args.data, load_bytes=args.tf_preprocessing, class_map=args.class_map) + calib_dataset = DatasetTar(args.calib_data, load_bytes=args.tf_preprocessing, class_map=args.class_map) + else: + dataset = Dataset(args.data, load_bytes=args.tf_preprocessing, class_map=args.class_map) + calib_dataset = Dataset(args.calib_data, load_bytes=args.tf_preprocessing, class_map=args.class_map) + if args.valid_labels: + with open(args.valid_labels, 'r') as f: + valid_labels = {int(line.rstrip()) for line in f} + valid_labels = [i in valid_labels for i in range(args.num_classes)] + else: + valid_labels = None + + if args.real_labels: + real_labels = RealLabelsImagenet(dataset.filenames(basename=True), real_json=args.real_labels) + else: + real_labels = None + + crop_pct = 1.0 if test_time_pool else data_config['crop_pct'] + loader = create_loader( + dataset, + input_size=data_config['input_size'], + batch_size=args.batch_size, + use_prefetcher=args.prefetcher, + interpolation=data_config['interpolation'], + mean=data_config['mean'], + std=data_config['std'], + num_workers=args.workers, + crop_pct=crop_pct, + pin_memory=args.pin_mem, + tf_preprocessing=args.tf_preprocessing) + calib_loader = create_loader( + calib_dataset, + input_size=data_config['input_size'], + batch_size=args.batch_size, + use_prefetcher=args.prefetcher, + interpolation=data_config['interpolation'], + mean=data_config['mean'], + std=data_config['std'], + num_workers=args.workers, + crop_pct=crop_pct, + pin_memory=args.pin_mem, + tf_preprocessing=args.tf_preprocessing) + batch_time = AverageMeter() + losses = AverageMeter() + top1 = AverageMeter() + top5 = AverageMeter() + print('Start calibration of quantization observers before post-quantization') + model.eval() + model.fuse_model() + model.qconfig = torch.quantization.get_default_qconfig('fbgemm') + print(model.qconfig) + torch.quantization.prepare(model, inplace=True) + + with torch.no_grad(): + # warmup, reduce variability of first batch time, especially for comparing torchscript vs non + input = torch.randn((args.batch_size,) + data_config['input_size']) + model(input) + end = time.time() + for batch_idx, (input, target) in enumerate(calib_loader): + if batch_idx > args.calib_iter: + break + if args.no_prefetcher: + if args.fp16: + input = input.half() + + # compute output + output = model(input) + if valid_labels is not None: + output = output[:, 
valid_labels] + loss = criterion(output, target) + + if real_labels is not None: + real_labels.add_result(output) + + # measure accuracy and record loss + acc1, acc5 = accuracy(output.data, target, topk=(1, 5)) + losses.update(loss.item(), input.size(0)) + top1.update(acc1.item(), input.size(0)) + top5.update(acc5.item(), input.size(0)) + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + + if batch_idx % args.log_freq == 0: + _logger.info( + 'Test: [{0:>4d}/{1}] ' + 'Time: {batch_time.val:.3f}s ({batch_time.avg:.3f}s, {rate_avg:>7.2f}/s) ' + 'Loss: {loss.val:>7.4f} ({loss.avg:>6.4f}) ' + 'Acc@1: {top1.val:>7.3f} ({top1.avg:>7.3f}) ' + 'Acc@5: {top5.val:>7.3f} ({top5.avg:>7.3f})'.format( + batch_idx, args.calib_iter, batch_time=batch_time, + rate_avg=input.size(0) / batch_time.avg, + loss=losses, top1=top1, top5=top5)) + + if real_labels is not None: + # real labels mode replaces topk values at the end + top1a, top5a = real_labels.get_accuracy(k=1), real_labels.get_accuracy(k=5) + else: + top1a, top5a = top1.avg, top5.avg + results = OrderedDict( + top1=round(top1a, 4), top1_err=round(100 - top1a, 4), + top5=round(top5a, 4), top5_err=round(100 - top5a, 4), + param_count=round(param_count / 1e6, 2), + img_size=data_config['input_size'][-1], + cropt_pct=crop_pct, + interpolation=data_config['interpolation']) + + _logger.info(' * Acc@1 {:.3f} ({:.3f}) Acc@5 {:.3f} ({:.3f})'.format( + results['top1'], results['top1_err'], results['top5'], results['top5_err'])) + print('Start validation of post-quantized model') + torch.quantization.convert(model.eval(),inplace = True) + + batch_time = AverageMeter() + losses = AverageMeter() + top1 = AverageMeter() + top5 = AverageMeter() + with torch.no_grad(): + # warmup, reduce variability of first batch time, especially for comparing torchscript vs non + input = torch.randn((args.batch_size,) + data_config['input_size']) + model(input) + end = time.time() + for batch_idx, (input, target) in enumerate(loader): + if args.no_prefetcher: + if args.fp16: + input = input.half() + + # compute output + output = model(input) + if valid_labels is not None: + output = output[:, valid_labels] + loss = criterion(output, target) + + if real_labels is not None: + real_labels.add_result(output) + + # measure accuracy and record loss + acc1, acc5 = accuracy(output.data, target, topk=(1, 5)) + losses.update(loss.item(), input.size(0)) + top1.update(acc1.item(), input.size(0)) + top5.update(acc5.item(), input.size(0)) + + # measure elapsed time + batch_time.update(time.time() - end) + end = time.time() + + if batch_idx % args.log_freq == 0: + _logger.info( + 'Test: [{0:>4d}/{1}] ' + 'Time: {batch_time.val:.3f}s ({batch_time.avg:.3f}s, {rate_avg:>7.2f}/s) ' + 'Loss: {loss.val:>7.4f} ({loss.avg:>6.4f}) ' + 'Acc@1: {top1.val:>7.3f} ({top1.avg:>7.3f}) ' + 'Acc@5: {top5.val:>7.3f} ({top5.avg:>7.3f})'.format( + batch_idx, len(loader), batch_time=batch_time, + rate_avg=input.size(0) / batch_time.avg, + loss=losses, top1=top1, top5=top5)) + + if real_labels is not None: + # real labels mode replaces topk values at the end + top1a, top5a = real_labels.get_accuracy(k=1), real_labels.get_accuracy(k=5) + else: + top1a, top5a = top1.avg, top5.avg + results = OrderedDict( + top1=round(top1a, 4), top1_err=round(100 - top1a, 4), + top5=round(top5a, 4), top5_err=round(100 - top5a, 4), + param_count=round(param_count / 1e6, 2), + img_size=data_config['input_size'][-1], + cropt_pct=crop_pct, + interpolation=data_config['interpolation']) + + _logger.info(' * 
Acc@1 {:.3f} ({:.3f}) Acc@5 {:.3f} ({:.3f})'.format( + results['top1'], results['top1_err'], results['top5'], results['top5_err'])) + return results + + +def main(): + setup_default_logging() + args = parser.parse_args() + model_cfgs = [] + model_names = [] + if os.path.isdir(args.checkpoint): + # validate all checkpoints in a path with same model + checkpoints = glob.glob(args.checkpoint + '/*.pth.tar') + checkpoints += glob.glob(args.checkpoint + '/*.pth') + model_names = list_models(args.model) + model_cfgs = [(args.model, c) for c in sorted(checkpoints, key=natural_key)] + else: + if args.model == 'all': + # validate all models in a list of names with pretrained checkpoints + args.pretrained = True + model_names = list_models(pretrained=True) + model_cfgs = [(n, '') for n in model_names] + elif not is_model(args.model): + # model name doesn't exist, try as wildcard filter + model_names = list_models(args.model) + model_cfgs = [(n, '') for n in model_names] + + if len(model_cfgs): + results_file = args.results_file or './results-all.csv' + _logger.info('Running bulk validation on these pretrained models: {}'.format(', '.join(model_names))) + results = [] + try: + start_batch_size = args.batch_size + for m, c in model_cfgs: + batch_size = start_batch_size + args.model = m + args.checkpoint = c + result = OrderedDict(model=args.model) + r = {} + while not r and batch_size >= args.num_gpu: + torch.cuda.empty_cache() + try: + args.batch_size = batch_size + print('Validating with batch size: %d' % args.batch_size) + r = validate(args) + except RuntimeError as e: + if batch_size <= args.num_gpu: + print("Validation failed with no ability to reduce batch size. Exiting.") + raise e + batch_size = max(batch_size // 2, args.num_gpu) + print("Validation failed, reducing batch size by 50%") + result.update(r) + if args.checkpoint: + result['checkpoint'] = args.checkpoint + results.append(result) + except KeyboardInterrupt as e: + pass + results = sorted(results, key=lambda x: x['top1'], reverse=True) + if len(results): + write_results(results_file, results) + else: + validate(args) + + +def write_results(results_file, results): + with open(results_file, mode='w') as cf: + dw = csv.DictWriter(cf, fieldnames=results[0].keys()) + dw.writeheader() + for r in results: + dw.writerow(r) + cf.flush() + + +if __name__ == '__main__': + main() diff --git a/timm/data/quant_loader.py b/timm/data/quant_loader.py new file mode 100644 index 00000000..a179217b --- /dev/null +++ b/timm/data/quant_loader.py @@ -0,0 +1,252 @@ +""" Loader Factory, Fast Collate, CUDA Prefetcher + +Prefetcher and Fast Collate inspired by NVIDIA APEX example at +https://github.com/NVIDIA/apex/commit/d5e2bb4bdeedd27b1dfaf5bb2b24d6c000dee9be#diff-cf86c282ff7fba81fad27a559379d5bf + +Hacked together by / Copyright 2020 Ross Wightman +""" + +import torch.utils.data +import numpy as np + +from .transforms_factory import create_transform +from .constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +from .distributed_sampler import OrderedDistributedSampler +from .random_erasing import RandomErasing +from .mixup import FastCollateMixup + + +def fast_collate(batch): + """ A fast collation function optimized for uint8 images (np array or torch) and int64 targets (labels)""" + assert isinstance(batch[0], tuple) + batch_size = len(batch) + if isinstance(batch[0][0], tuple): + # This branch 'deinterleaves' and flattens tuples of input tensors into one tensor ordered by position + # such that all tuple of position n will end up in a 
torch.split(tensor, batch_size) in nth position + inner_tuple_size = len(batch[0][0]) + flattened_batch_size = batch_size * inner_tuple_size + targets = torch.zeros(flattened_batch_size, dtype=torch.int64) + tensor = torch.zeros((flattened_batch_size, *batch[0][0][0].shape), dtype=torch.uint8) + for i in range(batch_size): + assert len(batch[i][0]) == inner_tuple_size # all input tensor tuples must be same length + for j in range(inner_tuple_size): + targets[i + j * batch_size] = batch[i][1] + tensor[i + j * batch_size] += torch.from_numpy(batch[i][0][j]) + return tensor, targets + elif isinstance(batch[0][0], np.ndarray): + targets = torch.tensor([b[1] for b in batch], dtype=torch.int64) + assert len(targets) == batch_size + tensor = torch.zeros((batch_size, *batch[0][0].shape), dtype=torch.uint8) + for i in range(batch_size): + tensor[i] += torch.from_numpy(batch[i][0]) + return tensor, targets + elif isinstance(batch[0][0], torch.Tensor): + targets = torch.tensor([b[1] for b in batch], dtype=torch.int64) + assert len(targets) == batch_size + tensor = torch.zeros((batch_size, *batch[0][0].shape), dtype=torch.uint8) + for i in range(batch_size): + tensor[i].copy_(batch[i][0]) + return tensor, targets + else: + assert False + + +class PrefetchLoader: + + def __init__(self, + loader, + mean=IMAGENET_DEFAULT_MEAN, + std=IMAGENET_DEFAULT_STD, + fp16=False, + re_prob=0., + re_mode='const', + re_count=1, + re_num_splits=0): + self.loader = loader + self.mean = torch.tensor([x * 255 for x in mean]).view(1, 3, 1, 1) + self.std = torch.tensor([x * 255 for x in std]).view(1, 3, 1, 1) + self.fp16 = fp16 + if fp16: + self.mean = self.mean.half() + self.std = self.std.half() + if re_prob > 0.: + self.random_erasing = RandomErasing( + probability=re_prob, mode=re_mode, max_count=re_count, num_splits=re_num_splits) + else: + self.random_erasing = None + + def __iter__(self): + first = True + + for next_input, next_target in self.loader: + if self.fp16: + next_input = next_input.half().sub_(self.mean).div_(self.std) + else: + next_input = next_input.float().sub_(self.mean).div_(self.std) + if self.random_erasing is not None: + next_input = self.random_erasing(next_input) + + if not first: + yield input, target + else: + first = False + + input = next_input + target = next_target + + yield input, target + + def __len__(self): + return len(self.loader) + + @property + def sampler(self): + return self.loader.sampler + + @property + def dataset(self): + return self.loader.dataset + + @property + def mixup_enabled(self): + if isinstance(self.loader.collate_fn, FastCollateMixup): + return self.loader.collate_fn.mixup_enabled + else: + return False + + @mixup_enabled.setter + def mixup_enabled(self, x): + if isinstance(self.loader.collate_fn, FastCollateMixup): + self.loader.collate_fn.mixup_enabled = x + + +def create_loader( + dataset, + input_size, + batch_size, + is_training=False, + use_prefetcher=True, + no_aug=False, + re_prob=0., + re_mode='const', + re_count=1, + re_split=False, + scale=None, + ratio=None, + hflip=0.5, + vflip=0., + color_jitter=0.4, + auto_augment=None, + num_aug_splits=0, + interpolation='bilinear', + mean=IMAGENET_DEFAULT_MEAN, + std=IMAGENET_DEFAULT_STD, + num_workers=1, + distributed=False, + crop_pct=None, + collate_fn=None, + pin_memory=False, + fp16=False, + tf_preprocessing=False, + use_multi_epochs_loader=False +): + re_num_splits = 0 + if re_split: + # apply RE to second half of batch if no aug split otherwise line up with aug split + re_num_splits = num_aug_splits or 2 + 
dataset.transform = create_transform( + input_size, + is_training=is_training, + use_prefetcher=use_prefetcher, + no_aug=no_aug, + scale=scale, + ratio=ratio, + hflip=hflip, + vflip=vflip, + color_jitter=color_jitter, + auto_augment=auto_augment, + interpolation=interpolation, + mean=mean, + std=std, + crop_pct=crop_pct, + tf_preprocessing=tf_preprocessing, + re_prob=re_prob, + re_mode=re_mode, + re_count=re_count, + re_num_splits=re_num_splits, + separate=num_aug_splits > 0, + ) + + sampler = None + if distributed: + if is_training: + sampler = torch.utils.data.distributed.DistributedSampler(dataset) + else: + # This will add extra duplicate entries to result in equal num + # of samples per-process, will slightly alter validation results + sampler = OrderedDistributedSampler(dataset) + + if collate_fn is None: + collate_fn = fast_collate if use_prefetcher else torch.utils.data.dataloader.default_collate + + loader_class = torch.utils.data.DataLoader + + if use_multi_epochs_loader: + loader_class = MultiEpochsDataLoader + + loader = loader_class( + dataset, + batch_size=batch_size, + shuffle=sampler is None and is_training, + num_workers=num_workers, + sampler=sampler, + collate_fn=collate_fn, + pin_memory=pin_memory, + drop_last=is_training, + ) + if use_prefetcher: + prefetch_re_prob = re_prob if is_training and not no_aug else 0. + loader = PrefetchLoader( + loader, + mean=mean, + std=std, + fp16=fp16, + re_prob=prefetch_re_prob, + re_mode=re_mode, + re_count=re_count, + re_num_splits=re_num_splits + ) + + return loader + + +class MultiEpochsDataLoader(torch.utils.data.DataLoader): + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._DataLoader__initialized = False + self.batch_sampler = _RepeatSampler(self.batch_sampler) + self._DataLoader__initialized = True + self.iterator = super().__iter__() + + def __len__(self): + return len(self.batch_sampler.sampler) + + def __iter__(self): + for i in range(len(self)): + yield next(self.iterator) + + +class _RepeatSampler(object): + """ Sampler that repeats forever. 
+ + Args: + sampler (Sampler) + """ + + def __init__(self, sampler): + self.sampler = sampler + + def __iter__(self): + while True: + yield from iter(self.sampler) diff --git a/timm/models/__init__.py b/timm/models/__init__.py index 96719e5e..0f747535 100644 --- a/timm/models/__init__.py +++ b/timm/models/__init__.py @@ -24,6 +24,10 @@ from .tresnet import * from .vovnet import * from .xception import * from .xception_aligned import * +from .frostnet import * +from .quantization.efficientnet import * +from .quantization.mobilenetv3 import * +from .quantization.rexnet import * from .factory import create_model from .helpers import load_checkpoint, resume_checkpoint diff --git a/timm/models/quantization/__init__.py b/timm/models/quantization/__init__.py new file mode 100644 index 00000000..03494d18 --- /dev/null +++ b/timm/models/quantization/__init__.py @@ -0,0 +1,10 @@ +from .efficientnet import * +from .mobilenetv3 import * +from .rexnet import * + +from timm.models.factory import create_model +from timm.models.helpers import load_checkpoint, resume_checkpoint +from .layers import TestTimePoolHead, apply_test_time_pool +from .layers import convert_splitbn_model +from .layers import is_scriptable, is_exportable, set_scriptable, set_exportable, is_no_jit, set_no_jit +from timm.models.registry import * diff --git a/timm/models/quantization/efficientnet.py b/timm/models/quantization/efficientnet.py new file mode 100644 index 00000000..58705b2c --- /dev/null +++ b/timm/models/quantization/efficientnet.py @@ -0,0 +1,1743 @@ +""" PyTorch EfficientNet Family + +An implementation of EfficienNet that covers variety of related models with efficient architectures: + +* EfficientNet (B0-B8, L2 + Tensorflow pretrained AutoAug/RandAug/AdvProp/NoisyStudent weight ports) + - EfficientNet: Rethinking Model Scaling for CNNs - https://arxiv.org/abs/1905.11946 + - CondConv: Conditionally Parameterized Convolutions for Efficient Inference - https://arxiv.org/abs/1904.04971 + - Adversarial Examples Improve Image Recognition - https://arxiv.org/abs/1911.09665 + - Self-training with Noisy Student improves ImageNet classification - https://arxiv.org/abs/1911.04252 + +* MixNet (Small, Medium, and Large) + - MixConv: Mixed Depthwise Convolutional Kernels - https://arxiv.org/abs/1907.09595 + +* MNasNet B1, A1 (SE), Small + - MnasNet: Platform-Aware Neural Architecture Search for Mobile - https://arxiv.org/abs/1807.11626 + +* FBNet-C + - FBNet: Hardware-Aware Efficient ConvNet Design via Differentiable NAS - https://arxiv.org/abs/1812.03443 + +* Single-Path NAS Pixel1 + - Single-Path NAS: Designing Hardware-Efficient ConvNets - https://arxiv.org/abs/1904.02877 + +* And likely more... 
+ +Hacked together by / Copyright 2020 Ross Wightman +""" +import torch +import torch.nn as nn +import torch.nn.functional as F + +from typing import List + +from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD +from .efficientnet_blocks import round_channels, resolve_bn_args, resolve_act_layer, BN_EPS_TF_DEFAULT +from .efficientnet_builder import EfficientNetBuilder, decode_arch_def, efficientnet_init_weights +from timm.models.features import FeatureInfo, FeatureHooks +from timm.models.helpers import build_model_with_cfg +from .layers import create_conv2d, create_classifier +from timm.models.registry import register_model +from .efficientnet_blocks import ConvBnAct, SqueezeExcite, DepthwiseSeparableConv, InvertedResidual +__all__ = ['EfficientNet'] + + +def _cfg(url='', **kwargs): + return { + 'url': url, 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7), + 'crop_pct': 0.875, 'interpolation': 'bicubic', + 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, + 'first_conv': 'conv_stem', 'classifier': 'classifier', + **kwargs + } + + +default_cfgs = { + 'mnasnet_050': _cfg(url=''), + 'mnasnet_075': _cfg(url=''), + 'mnasnet_100': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mnasnet_b1-74cb7081.pth'), + 'mnasnet_140': _cfg(url=''), + + 'semnasnet_050': _cfg(url=''), + 'semnasnet_075': _cfg(url=''), + 'semnasnet_100': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mnasnet_a1-d9418771.pth'), + 'semnasnet_140': _cfg(url=''), + 'mnasnet_small': _cfg(url=''), + + 'mobilenetv2_100': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mobilenetv2_100_ra-b33bc2c4.pth'), + 'mobilenetv2_110d': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mobilenetv2_110d_ra-77090ade.pth'), + 'mobilenetv2_120d': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mobilenetv2_120d_ra-5987e2ed.pth'), + 'mobilenetv2_140': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mobilenetv2_140_ra-21a4e913.pth'), + + 'fbnetc_100': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/fbnetc_100-c345b898.pth', + interpolation='bilinear'), + 'spnasnet_100': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/spnasnet_100-048bc3f4.pth', + interpolation='bilinear'), + + 'efficientnet_b0': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_b0_ra-3dd342df.pth'), + 'efficientnet_b1': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_b1-533bc792.pth', + input_size=(3, 240, 240), pool_size=(8, 8)), + 'efficientnet_b2': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_b2_ra-bcdf34b7.pth', + input_size=(3, 260, 260), pool_size=(9, 9)), + 'efficientnet_b2a': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_b2_ra-bcdf34b7.pth', + input_size=(3, 288, 288), pool_size=(9, 9), crop_pct=1.0), + 'efficientnet_b3': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_b3_ra2-cf984f9c.pth', + input_size=(3, 300, 300), pool_size=(10, 10), 
crop_pct=0.904), + 'efficientnet_b3a': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_b3_ra2-cf984f9c.pth', + input_size=(3, 320, 320), pool_size=(10, 10), crop_pct=1.0), + 'efficientnet_b4': _cfg( + url='', input_size=(3, 380, 380), pool_size=(12, 12), crop_pct=0.922), + 'efficientnet_b5': _cfg( + url='', input_size=(3, 456, 456), pool_size=(15, 15), crop_pct=0.934), + 'efficientnet_b6': _cfg( + url='', input_size=(3, 528, 528), pool_size=(17, 17), crop_pct=0.942), + 'efficientnet_b7': _cfg( + url='', input_size=(3, 600, 600), pool_size=(19, 19), crop_pct=0.949), + 'efficientnet_b8': _cfg( + url='', input_size=(3, 672, 672), pool_size=(21, 21), crop_pct=0.954), + 'efficientnet_l2': _cfg( + url='', input_size=(3, 800, 800), pool_size=(25, 25), crop_pct=0.961), + + 'efficientnet_es': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_es_ra-f111e99c.pth'), + 'efficientnet_em': _cfg( + url='', input_size=(3, 240, 240), pool_size=(8, 8), crop_pct=0.882), + 'efficientnet_el': _cfg( + url='', input_size=(3, 300, 300), pool_size=(10, 10), crop_pct=0.904), + + 'efficientnet_cc_b0_4e': _cfg(url=''), + 'efficientnet_cc_b0_8e': _cfg(url=''), + 'efficientnet_cc_b1_8e': _cfg(url='', input_size=(3, 240, 240), pool_size=(8, 8), crop_pct=0.882), + + 'efficientnet_lite0': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/efficientnet_lite0_ra-37913777.pth'), + 'efficientnet_lite1': _cfg( + url='', + input_size=(3, 240, 240), pool_size=(8, 8), crop_pct=0.882), + 'efficientnet_lite2': _cfg( + url='', + input_size=(3, 260, 260), pool_size=(9, 9), crop_pct=0.890), + 'efficientnet_lite3': _cfg( + url='', + input_size=(3, 300, 300), pool_size=(10, 10), crop_pct=0.904), + 'efficientnet_lite4': _cfg( + url='', input_size=(3, 380, 380), pool_size=(12, 12), crop_pct=0.922), + + 'efficientnet_b1_pruned': _cfg( + url='https://imvl-automl-sh.oss-cn-shanghai.aliyuncs.com/darts/hyperml/hyperml/job_45403/outputs/effnetb1_pruned_9ebb3fe6.pth', + input_size=(3, 240, 240), pool_size=(8, 8), crop_pct=0.882, mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD), + 'efficientnet_b2_pruned': _cfg( + url='https://imvl-automl-sh.oss-cn-shanghai.aliyuncs.com/darts/hyperml/hyperml/job_45403/outputs/effnetb2_pruned_203f55bc.pth', + input_size=(3, 260, 260), pool_size=(9, 9), crop_pct=0.890, mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD), + 'efficientnet_b3_pruned': _cfg( + url='https://imvl-automl-sh.oss-cn-shanghai.aliyuncs.com/darts/hyperml/hyperml/job_45403/outputs/effnetb3_pruned_5abcc29f.pth', + input_size=(3, 300, 300), pool_size=(10, 10), crop_pct=0.904, mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD), + + 'tf_efficientnet_b0': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b0_aa-827b6e33.pth', + input_size=(3, 224, 224)), + 'tf_efficientnet_b1': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b1_aa-ea7a6ee0.pth', + input_size=(3, 240, 240), pool_size=(8, 8), crop_pct=0.882), + 'tf_efficientnet_b2': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b2_aa-60c94f97.pth', + input_size=(3, 260, 260), pool_size=(9, 9), crop_pct=0.890), + 'tf_efficientnet_b3': _cfg( + 
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b3_aa-84b4657e.pth', + input_size=(3, 300, 300), pool_size=(10, 10), crop_pct=0.904), + 'tf_efficientnet_b4': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b4_aa-818f208c.pth', + input_size=(3, 380, 380), pool_size=(12, 12), crop_pct=0.922), + 'tf_efficientnet_b5': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b5_ra-9a3e5369.pth', + input_size=(3, 456, 456), pool_size=(15, 15), crop_pct=0.934), + 'tf_efficientnet_b6': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b6_aa-80ba17e4.pth', + input_size=(3, 528, 528), pool_size=(17, 17), crop_pct=0.942), + 'tf_efficientnet_b7': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b7_ra-6c08e654.pth', + input_size=(3, 600, 600), pool_size=(19, 19), crop_pct=0.949), + 'tf_efficientnet_b8': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b8_ra-572d5dd9.pth', + input_size=(3, 672, 672), pool_size=(21, 21), crop_pct=0.954), + + 'tf_efficientnet_b0_ap': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b0_ap-f262efe1.pth', + mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, input_size=(3, 224, 224)), + 'tf_efficientnet_b1_ap': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b1_ap-44ef0a3d.pth', + mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, + input_size=(3, 240, 240), pool_size=(8, 8), crop_pct=0.882), + 'tf_efficientnet_b2_ap': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b2_ap-2f8e7636.pth', + mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, + input_size=(3, 260, 260), pool_size=(9, 9), crop_pct=0.890), + 'tf_efficientnet_b3_ap': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b3_ap-aad25bdd.pth', + mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, + input_size=(3, 300, 300), pool_size=(10, 10), crop_pct=0.904), + 'tf_efficientnet_b4_ap': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b4_ap-dedb23e6.pth', + mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, + input_size=(3, 380, 380), pool_size=(12, 12), crop_pct=0.922), + 'tf_efficientnet_b5_ap': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b5_ap-9e82fae8.pth', + mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, + input_size=(3, 456, 456), pool_size=(15, 15), crop_pct=0.934), + 'tf_efficientnet_b6_ap': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b6_ap-4ffb161f.pth', + mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, + input_size=(3, 528, 528), pool_size=(17, 17), crop_pct=0.942), + 'tf_efficientnet_b7_ap': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b7_ap-ddb28fec.pth', + mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, + input_size=(3, 600, 600), pool_size=(19, 19), crop_pct=0.949), + 'tf_efficientnet_b8_ap': _cfg( + 
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b8_ap-00e169fa.pth', + mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, + input_size=(3, 672, 672), pool_size=(21, 21), crop_pct=0.954), + + 'tf_efficientnet_b0_ns': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b0_ns-c0e6a31c.pth', + input_size=(3, 224, 224)), + 'tf_efficientnet_b1_ns': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b1_ns-99dd0c41.pth', + input_size=(3, 240, 240), pool_size=(8, 8), crop_pct=0.882), + 'tf_efficientnet_b2_ns': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b2_ns-00306e48.pth', + input_size=(3, 260, 260), pool_size=(9, 9), crop_pct=0.890), + 'tf_efficientnet_b3_ns': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b3_ns-9d44bf68.pth', + input_size=(3, 300, 300), pool_size=(10, 10), crop_pct=0.904), + 'tf_efficientnet_b4_ns': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b4_ns-d6313a46.pth', + input_size=(3, 380, 380), pool_size=(12, 12), crop_pct=0.922), + 'tf_efficientnet_b5_ns': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b5_ns-6f26d0cf.pth', + input_size=(3, 456, 456), pool_size=(15, 15), crop_pct=0.934), + 'tf_efficientnet_b6_ns': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b6_ns-51548356.pth', + input_size=(3, 528, 528), pool_size=(17, 17), crop_pct=0.942), + 'tf_efficientnet_b7_ns': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_b7_ns-1dbc32de.pth', + input_size=(3, 600, 600), pool_size=(19, 19), crop_pct=0.949), + 'tf_efficientnet_l2_ns_475': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_l2_ns_475-bebbd00a.pth', + input_size=(3, 475, 475), pool_size=(15, 15), crop_pct=0.936), + 'tf_efficientnet_l2_ns': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_l2_ns-df73bb44.pth', + input_size=(3, 800, 800), pool_size=(25, 25), crop_pct=0.96), + + 'tf_efficientnet_es': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_es-ca1afbfe.pth', + mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), + input_size=(3, 224, 224), ), + 'tf_efficientnet_em': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_em-e78cfe58.pth', + mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), + input_size=(3, 240, 240), pool_size=(8, 8), crop_pct=0.882), + 'tf_efficientnet_el': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_el-5143854e.pth', + mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), + input_size=(3, 300, 300), pool_size=(10, 10), crop_pct=0.904), + + 'tf_efficientnet_cc_b0_4e': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_cc_b0_4e-4362b6b2.pth', + mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD), + 'tf_efficientnet_cc_b0_8e': _cfg( + 
url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_cc_b0_8e-66184a25.pth', + mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD), + 'tf_efficientnet_cc_b1_8e': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_cc_b1_8e-f7c79ae1.pth', + mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD, + input_size=(3, 240, 240), pool_size=(8, 8), crop_pct=0.882), + + 'tf_efficientnet_lite0': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_lite0-0aa007d2.pth', + mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), + interpolation='bicubic', # should be bilinear but bicubic better match for TF bilinear at low res + ), + 'tf_efficientnet_lite1': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_lite1-bde8b488.pth', + mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), + input_size=(3, 240, 240), pool_size=(8, 8), crop_pct=0.882, + interpolation='bicubic', # should be bilinear but bicubic better match for TF bilinear at low res + ), + 'tf_efficientnet_lite2': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_lite2-dcccb7df.pth', + mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), + input_size=(3, 260, 260), pool_size=(9, 9), crop_pct=0.890, + interpolation='bicubic', # should be bilinear but bicubic better match for TF bilinear at low res + ), + 'tf_efficientnet_lite3': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_lite3-b733e338.pth', + mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), + input_size=(3, 300, 300), pool_size=(10, 10), crop_pct=0.904, interpolation='bilinear'), + 'tf_efficientnet_lite4': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_efficientnet_lite4-741542c3.pth', + mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), + input_size=(3, 380, 380), pool_size=(12, 12), crop_pct=0.920, interpolation='bilinear'), + + 'mixnet_s': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mixnet_s-a907afbc.pth'), + 'mixnet_m': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mixnet_m-4647fc68.pth'), + 'mixnet_l': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mixnet_l-5a9a2ed8.pth'), + 'mixnet_xl': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mixnet_xl_ra-aac3c00c.pth'), + 'mixnet_xxl': _cfg(), + + 'tf_mixnet_s': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_mixnet_s-89d3354b.pth'), + 'tf_mixnet_m': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_mixnet_m-0f4d8805.pth'), + 'tf_mixnet_l': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_mixnet_l-6c92e0c8.pth'), +} + +_DEBUG = False + + +class EfficientNet(nn.Module): + """ (Generic) EfficientNet + + A flexible and performant PyTorch implementation of efficient network architectures, including: + * EfficientNet B0-B8, L2 + * EfficientNet-EdgeTPU + * EfficientNet-CondConv + * MixNet S, M, L, XL + * MnasNet A1, B1, and small + * FBNet C + * Single-Path NAS Pixel1 + + """ + + def __init__(self, block_args, num_classes=1000, num_features=1280, in_chans=3, 
stem_size=32, + channel_multiplier=1.0, channel_divisor=8, channel_min=None, + output_stride=32, pad_type='', fix_stem=False, act_layer=nn.ReLU, drop_rate=0., drop_path_rate=0., + se_kwargs=None, norm_layer=nn.BatchNorm2d, norm_kwargs=None, global_pool='avg'): + super(EfficientNet, self).__init__() + norm_kwargs = norm_kwargs or {} + #temporary fix to support tf_models with torch quantization + pad_type='' + self.num_classes = num_classes + self.num_features = num_features + self.drop_rate = drop_rate + + # Stem + if not fix_stem: + stem_size = round_channels(stem_size, channel_multiplier, channel_divisor, channel_min) + self.conv_stem = create_conv2d(in_chans, stem_size, 3, stride=2, padding=pad_type) + self.bn1 = norm_layer(stem_size, **norm_kwargs) + self.act1 = act_layer(inplace=True) + + # Middle stages (IR/ER/DS Blocks) + builder = EfficientNetBuilder( + channel_multiplier, channel_divisor, channel_min, output_stride, pad_type, act_layer, se_kwargs, + norm_layer, norm_kwargs, drop_path_rate, verbose=_DEBUG) + self.blocks = nn.Sequential(*builder(stem_size, block_args)) + self.feature_info = builder.features + head_chs = builder.in_chs + + # Head + Pooling + self.conv_head = create_conv2d(head_chs, self.num_features, 1, padding=pad_type) + self.bn2 = norm_layer(self.num_features, **norm_kwargs) + self.act2 = act_layer(inplace=True) + self.global_pool, self.classifier = create_classifier( + self.num_features, self.num_classes, pool_type=global_pool) + + # Quantization Stubs + self.quant = torch.quantization.QuantStub() + self.dequant = torch.quantization.DeQuantStub() + + efficientnet_init_weights(self) + + def as_sequential(self): + layers = [self.conv_stem, self.bn1, self.act1] + layers.extend(self.blocks) + layers.extend([self.conv_head, self.bn2, self.act2, self.global_pool]) + layers.extend([nn.Dropout(self.drop_rate), self.classifier]) + return nn.Sequential(*layers) + + def get_classifier(self): + return self.classifier + + def reset_classifier(self, num_classes, global_pool='avg'): + self.num_classes = num_classes + self.global_pool, self.classifier = create_classifier( + self.num_features, self.num_classes, pool_type=global_pool) + + def forward_features(self, x): + x = self.conv_stem(x) + x = self.bn1(x) + x = self.act1(x) + x = self.blocks(x) + x = self.conv_head(x) + x = self.bn2(x) + x = self.act2(x) + + return x + + def forward(self, x): + x = self.quant(x) + x = self.forward_features(x) + x = self.global_pool(x) + if self.drop_rate > 0.: + x = F.dropout(x, p=self.drop_rate, training=self.training) + x = self.classifier(x) + x = self.dequant(x) + return x + + def fuse_model(self): + modules_to_fuse = [['conv_stem','bn1'], ['conv_head','bn2']] + if type(self.act1) == nn.ReLU: + modules_to_fuse[0].append('act1') + if type(self.act2) == nn.ReLU: + modules_to_fuse[1].append('act2') + torch.quantization.fuse_modules(self, modules_to_fuse, inplace=True) + for m in self.modules(): + if type(m) in [ConvBnAct, SqueezeExcite, DepthwiseSeparableConv, InvertedResidual]: + m.fuse_module() + +class EfficientNetFeatures(nn.Module): + """ EfficientNet Feature Extractor + + A work-in-progress feature extraction module for EfficientNet, to use as a backbone for segmentation + and object detection models. 
+ """ + + def __init__(self, block_args, out_indices=(0, 1, 2, 3, 4), feature_location='bottleneck', + in_chans=3, stem_size=32, channel_multiplier=1.0, channel_divisor=8, channel_min=None, + output_stride=32, pad_type='', fix_stem=False, act_layer=nn.ReLU, drop_rate=0., drop_path_rate=0., + se_kwargs=None, norm_layer=nn.BatchNorm2d, norm_kwargs=None): + super(EfficientNetFeatures, self).__init__() + norm_kwargs = norm_kwargs or {} + self.drop_rate = drop_rate + + # Stem + if not fix_stem: + stem_size = round_channels(stem_size, channel_multiplier, channel_divisor, channel_min) + self.conv_stem = create_conv2d(in_chans, stem_size, 3, stride=2, padding=pad_type) + self.bn1 = norm_layer(stem_size, **norm_kwargs) + self.act1 = act_layer(inplace=True) + + # Middle stages (IR/ER/DS Blocks) + builder = EfficientNetBuilder( + channel_multiplier, channel_divisor, channel_min, output_stride, pad_type, act_layer, se_kwargs, + norm_layer, norm_kwargs, drop_path_rate, feature_location=feature_location, verbose=_DEBUG) + self.blocks = nn.Sequential(*builder(stem_size, block_args)) + self.feature_info = FeatureInfo(builder.features, out_indices) + self._stage_out_idx = {v['stage']: i for i, v in enumerate(self.feature_info) if i in out_indices} + + efficientnet_init_weights(self) + + # Register feature extraction hooks with FeatureHooks helper + self.feature_hooks = None + if feature_location != 'bottleneck': + hooks = self.feature_info.get_dicts(keys=('module', 'hook_type')) + self.feature_hooks = FeatureHooks(hooks, self.named_modules()) + + def forward(self, x) -> List[torch.Tensor]: + x = self.conv_stem(x) + x = self.bn1(x) + x = self.act1(x) + if self.feature_hooks is None: + features = [] + if 0 in self._stage_out_idx: + features.append(x) # add stem out + for i, b in enumerate(self.blocks): + x = b(x) + if i + 1 in self._stage_out_idx: + features.append(x) + return features + else: + self.blocks(x) + out = self.feature_hooks.get_output(x.device) + return list(out.values()) + + +def _create_effnet(model_kwargs, variant, pretrained=False): + if model_kwargs.pop('features_only', False): + load_strict = False + model_kwargs.pop('num_classes', 0) + model_kwargs.pop('num_features', 0) + model_kwargs.pop('head_conv', None) + model_cls = EfficientNetFeatures + else: + load_strict = True + model_cls = EfficientNet + return build_model_with_cfg( + model_cls, variant, pretrained, default_cfg=default_cfgs[variant], + pretrained_strict=load_strict, **model_kwargs) + + +def _gen_mnasnet_a1(variant, channel_multiplier=1.0, pretrained=False, **kwargs): + """Creates a mnasnet-a1 model. + + Ref impl: https://github.com/tensorflow/tpu/tree/master/models/official/mnasnet + Paper: https://arxiv.org/pdf/1807.11626.pdf. + + Args: + channel_multiplier: multiplier to number of channels per layer. 
+ """ + arch_def = [ + # stage 0, 112x112 in + ['ds_r1_k3_s1_e1_c16_noskip'], + # stage 1, 112x112 in + ['ir_r2_k3_s2_e6_c24'], + # stage 2, 56x56 in + ['ir_r3_k5_s2_e3_c40_se0.25'], + # stage 3, 28x28 in + ['ir_r4_k3_s2_e6_c80'], + # stage 4, 14x14in + ['ir_r2_k3_s1_e6_c112_se0.25'], + # stage 5, 14x14in + ['ir_r3_k5_s2_e6_c160_se0.25'], + # stage 6, 7x7 in + ['ir_r1_k3_s1_e6_c320'], + ] + model_kwargs = dict( + block_args=decode_arch_def(arch_def), + stem_size=32, + channel_multiplier=channel_multiplier, + norm_kwargs=resolve_bn_args(kwargs), + **kwargs + ) + model = _create_effnet(model_kwargs, variant, pretrained) + return model + + +def _gen_mnasnet_b1(variant, channel_multiplier=1.0, pretrained=False, **kwargs): + """Creates a mnasnet-b1 model. + + Ref impl: https://github.com/tensorflow/tpu/tree/master/models/official/mnasnet + Paper: https://arxiv.org/pdf/1807.11626.pdf. + + Args: + channel_multiplier: multiplier to number of channels per layer. + """ + arch_def = [ + # stage 0, 112x112 in + ['ds_r1_k3_s1_c16_noskip'], + # stage 1, 112x112 in + ['ir_r3_k3_s2_e3_c24'], + # stage 2, 56x56 in + ['ir_r3_k5_s2_e3_c40'], + # stage 3, 28x28 in + ['ir_r3_k5_s2_e6_c80'], + # stage 4, 14x14in + ['ir_r2_k3_s1_e6_c96'], + # stage 5, 14x14in + ['ir_r4_k5_s2_e6_c192'], + # stage 6, 7x7 in + ['ir_r1_k3_s1_e6_c320_noskip'] + ] + model_kwargs = dict( + block_args=decode_arch_def(arch_def), + stem_size=32, + channel_multiplier=channel_multiplier, + norm_kwargs=resolve_bn_args(kwargs), + **kwargs + ) + model = _create_effnet(model_kwargs, variant, pretrained) + return model + + +def _gen_mnasnet_small(variant, channel_multiplier=1.0, pretrained=False, **kwargs): + """Creates a mnasnet-b1 model. + + Ref impl: https://github.com/tensorflow/tpu/tree/master/models/official/mnasnet + Paper: https://arxiv.org/pdf/1807.11626.pdf. + + Args: + channel_multiplier: multiplier to number of channels per layer. 
+ """ + arch_def = [ + ['ds_r1_k3_s1_c8'], + ['ir_r1_k3_s2_e3_c16'], + ['ir_r2_k3_s2_e6_c16'], + ['ir_r4_k5_s2_e6_c32_se0.25'], + ['ir_r3_k3_s1_e6_c32_se0.25'], + ['ir_r3_k5_s2_e6_c88_se0.25'], + ['ir_r1_k3_s1_e6_c144'] + ] + model_kwargs = dict( + block_args=decode_arch_def(arch_def), + stem_size=8, + channel_multiplier=channel_multiplier, + norm_kwargs=resolve_bn_args(kwargs), + **kwargs + ) + model = _create_effnet(model_kwargs,variant, pretrained) + return model + + +def _gen_mobilenet_v2( + variant, channel_multiplier=1.0, depth_multiplier=1.0, fix_stem_head=False, pretrained=False, **kwargs): + """ Generate MobileNet-V2 network + Ref impl: https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet_v2.py + Paper: https://arxiv.org/abs/1801.04381 + """ + arch_def = [ + ['ds_r1_k3_s1_c16'], + ['ir_r2_k3_s2_e6_c24'], + ['ir_r3_k3_s2_e6_c32'], + ['ir_r4_k3_s2_e6_c64'], + ['ir_r3_k3_s1_e6_c96'], + ['ir_r3_k3_s2_e6_c160'], + ['ir_r1_k3_s1_e6_c320'], + ] + model_kwargs = dict( + block_args=decode_arch_def(arch_def, depth_multiplier=depth_multiplier, fix_first_last=fix_stem_head), + num_features=1280 if fix_stem_head else round_channels(1280, channel_multiplier, 8, None), + stem_size=32, + fix_stem=fix_stem_head, + channel_multiplier=channel_multiplier, + norm_kwargs=resolve_bn_args(kwargs), + act_layer=resolve_act_layer(kwargs, 'relu6'), + **kwargs + ) + model = _create_effnet(model_kwargs, variant, pretrained) + return model + + +def _gen_fbnetc(variant, channel_multiplier=1.0, pretrained=False, **kwargs): + """ FBNet-C + + Paper: https://arxiv.org/abs/1812.03443 + Ref Impl: https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/modeling/backbone/fbnet_modeldef.py + + NOTE: the impl above does not relate to the 'C' variant here, that was derived from paper, + it was used to confirm some building block details + """ + arch_def = [ + ['ir_r1_k3_s1_e1_c16'], + ['ir_r1_k3_s2_e6_c24', 'ir_r2_k3_s1_e1_c24'], + ['ir_r1_k5_s2_e6_c32', 'ir_r1_k5_s1_e3_c32', 'ir_r1_k5_s1_e6_c32', 'ir_r1_k3_s1_e6_c32'], + ['ir_r1_k5_s2_e6_c64', 'ir_r1_k5_s1_e3_c64', 'ir_r2_k5_s1_e6_c64'], + ['ir_r3_k5_s1_e6_c112', 'ir_r1_k5_s1_e3_c112'], + ['ir_r4_k5_s2_e6_c184'], + ['ir_r1_k3_s1_e6_c352'], + ] + model_kwargs = dict( + block_args=decode_arch_def(arch_def), + stem_size=16, + num_features=1984, # paper suggests this, but is not 100% clear + channel_multiplier=channel_multiplier, + norm_kwargs=resolve_bn_args(kwargs), + **kwargs + ) + model = _create_effnet(model_kwargs, variant, pretrained) + return model + + +def _gen_spnasnet(variant, channel_multiplier=1.0, pretrained=False, **kwargs): + """Creates the Single-Path NAS model from search targeted for Pixel1 phone. + + Paper: https://arxiv.org/abs/1904.02877 + + Args: + channel_multiplier: multiplier to number of channels per layer. 
+ """ + arch_def = [ + # stage 0, 112x112 in + ['ds_r1_k3_s1_c16_noskip'], + # stage 1, 112x112 in + ['ir_r3_k3_s2_e3_c24'], + # stage 2, 56x56 in + ['ir_r1_k5_s2_e6_c40', 'ir_r3_k3_s1_e3_c40'], + # stage 3, 28x28 in + ['ir_r1_k5_s2_e6_c80', 'ir_r3_k3_s1_e3_c80'], + # stage 4, 14x14in + ['ir_r1_k5_s1_e6_c96', 'ir_r3_k5_s1_e3_c96'], + # stage 5, 14x14in + ['ir_r4_k5_s2_e6_c192'], + # stage 6, 7x7 in + ['ir_r1_k3_s1_e6_c320_noskip'] + ] + model_kwargs = dict( + block_args=decode_arch_def(arch_def), + stem_size=32, + channel_multiplier=channel_multiplier, + norm_kwargs=resolve_bn_args(kwargs), + **kwargs + ) + model = _create_effnet(model_kwargs, variant, pretrained) + return model + + +def _gen_efficientnet(variant, channel_multiplier=1.0, depth_multiplier=1.0, pretrained=False, **kwargs): + """Creates an EfficientNet model. + + Ref impl: https://github.com/tensorflow/tpu/blob/master/models/official/efficientnet/efficientnet_model.py + Paper: https://arxiv.org/abs/1905.11946 + + EfficientNet params + name: (channel_multiplier, depth_multiplier, resolution, dropout_rate) + 'efficientnet-b0': (1.0, 1.0, 224, 0.2), + 'efficientnet-b1': (1.0, 1.1, 240, 0.2), + 'efficientnet-b2': (1.1, 1.2, 260, 0.3), + 'efficientnet-b3': (1.2, 1.4, 300, 0.3), + 'efficientnet-b4': (1.4, 1.8, 380, 0.4), + 'efficientnet-b5': (1.6, 2.2, 456, 0.4), + 'efficientnet-b6': (1.8, 2.6, 528, 0.5), + 'efficientnet-b7': (2.0, 3.1, 600, 0.5), + 'efficientnet-b8': (2.2, 3.6, 672, 0.5), + 'efficientnet-l2': (4.3, 5.3, 800, 0.5), + + Args: + channel_multiplier: multiplier to number of channels per layer + depth_multiplier: multiplier to number of repeats per stage + + """ + arch_def = [ + ['ds_r1_k3_s1_e1_c16_se0.25'], + ['ir_r2_k3_s2_e6_c24_se0.25'], + ['ir_r2_k5_s2_e6_c40_se0.25'], + ['ir_r3_k3_s2_e6_c80_se0.25'], + ['ir_r3_k5_s1_e6_c112_se0.25'], + ['ir_r4_k5_s2_e6_c192_se0.25'], + ['ir_r1_k3_s1_e6_c320_se0.25'], + ] + model_kwargs = dict( + block_args=decode_arch_def(arch_def, depth_multiplier), + num_features=round_channels(1280, channel_multiplier, 8, None), + stem_size=32, + channel_multiplier=channel_multiplier, + act_layer=resolve_act_layer(kwargs, 'swish'), + norm_kwargs=resolve_bn_args(kwargs), + **kwargs, + ) + model = _create_effnet(model_kwargs, variant, pretrained) + return model + + +def _gen_efficientnet_edge(variant, channel_multiplier=1.0, depth_multiplier=1.0, pretrained=False, **kwargs): + """ Creates an EfficientNet-EdgeTPU model + + Ref impl: https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet/edgetpu + """ + + arch_def = [ + # NOTE `fc` is present to override a mismatch between stem channels and in chs not + # present in other models + ['er_r1_k3_s1_e4_c24_fc24_noskip'], + ['er_r2_k3_s2_e8_c32'], + ['er_r4_k3_s2_e8_c48'], + ['ir_r5_k5_s2_e8_c96'], + ['ir_r4_k5_s1_e8_c144'], + ['ir_r2_k5_s2_e8_c192'], + ] + model_kwargs = dict( + block_args=decode_arch_def(arch_def, depth_multiplier), + num_features=round_channels(1280, channel_multiplier, 8, None), + stem_size=32, + channel_multiplier=channel_multiplier, + norm_kwargs=resolve_bn_args(kwargs), + act_layer=resolve_act_layer(kwargs, 'relu'), + **kwargs, + ) + model = _create_effnet(model_kwargs, variant, pretrained) + return model + + +def _gen_efficientnet_condconv( + variant, channel_multiplier=1.0, depth_multiplier=1.0, experts_multiplier=1, pretrained=False, **kwargs): + """Creates an EfficientNet-CondConv model. 
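The 'cc4' suffix in the block strings below requests 4 CondConv experts per block; decode_arch_def then multiplies that by experts_multiplier, which is how the '_4e' and '_8e' registry variants differ. Roughly:

    ba = decode_arch_def([['ir_r1_k3_s1_e6_c112_se0.25_cc4']], experts_multiplier=2)[0][0]
    assert ba['num_experts'] == 8   # 4 base experts * experts_multiplier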
+ + Ref impl: https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet/condconv + """ + arch_def = [ + ['ds_r1_k3_s1_e1_c16_se0.25'], + ['ir_r2_k3_s2_e6_c24_se0.25'], + ['ir_r2_k5_s2_e6_c40_se0.25'], + ['ir_r3_k3_s2_e6_c80_se0.25'], + ['ir_r3_k5_s1_e6_c112_se0.25_cc4'], + ['ir_r4_k5_s2_e6_c192_se0.25_cc4'], + ['ir_r1_k3_s1_e6_c320_se0.25_cc4'], + ] + # NOTE unlike official impl, this one uses `cc` option where x is the base number of experts for each stage and + # the expert_multiplier increases that on a per-model basis as with depth/channel multipliers + model_kwargs = dict( + block_args=decode_arch_def(arch_def, depth_multiplier, experts_multiplier=experts_multiplier), + num_features=round_channels(1280, channel_multiplier, 8, None), + stem_size=32, + channel_multiplier=channel_multiplier, + norm_kwargs=resolve_bn_args(kwargs), + act_layer=resolve_act_layer(kwargs, 'swish'), + **kwargs, + ) + model = _create_effnet(model_kwargs, variant, pretrained) + return model + + +def _gen_efficientnet_lite(variant, channel_multiplier=1.0, depth_multiplier=1.0, pretrained=False, **kwargs): + """Creates an EfficientNet-Lite model. + + Ref impl: https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet/lite + Paper: https://arxiv.org/abs/1905.11946 + + EfficientNet params + name: (channel_multiplier, depth_multiplier, resolution, dropout_rate) + 'efficientnet-lite0': (1.0, 1.0, 224, 0.2), + 'efficientnet-lite1': (1.0, 1.1, 240, 0.2), + 'efficientnet-lite2': (1.1, 1.2, 260, 0.3), + 'efficientnet-lite3': (1.2, 1.4, 280, 0.3), + 'efficientnet-lite4': (1.4, 1.8, 300, 0.3), + + Args: + channel_multiplier: multiplier to number of channels per layer + depth_multiplier: multiplier to number of repeats per stage + """ + arch_def = [ + ['ds_r1_k3_s1_e1_c16'], + ['ir_r2_k3_s2_e6_c24'], + ['ir_r2_k5_s2_e6_c40'], + ['ir_r3_k3_s2_e6_c80'], + ['ir_r3_k5_s1_e6_c112'], + ['ir_r4_k5_s2_e6_c192'], + ['ir_r1_k3_s1_e6_c320'], + ] + model_kwargs = dict( + block_args=decode_arch_def(arch_def, depth_multiplier, fix_first_last=True), + num_features=1280, + stem_size=32, + fix_stem=True, + channel_multiplier=channel_multiplier, + act_layer=resolve_act_layer(kwargs, 'relu6'), + norm_kwargs=resolve_bn_args(kwargs), + **kwargs, + ) + model = _create_effnet(model_kwargs, variant, pretrained) + return model + + +def _gen_mixnet_s(variant, channel_multiplier=1.0, pretrained=False, **kwargs): + """Creates a MixNet Small model. 
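Kernel entries such as 'k3.5.7' in the stage strings below are parsed into a list of kernel sizes and handled as a mixed (grouped) convolution by the MixedConv2d layer; a single value stays a plain convolution. Illustrative only:

    _parse_ksize('3.5.7')   # -> [3, 5, 7], one channel group per kernel size
    _parse_ksize('3')       # -> 3, ordinary single-kernel convolution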
+ + Ref impl: https://github.com/tensorflow/tpu/tree/master/models/official/mnasnet/mixnet + Paper: https://arxiv.org/abs/1907.09595 + """ + arch_def = [ + # stage 0, 112x112 in + ['ds_r1_k3_s1_e1_c16'], # relu + # stage 1, 112x112 in + ['ir_r1_k3_a1.1_p1.1_s2_e6_c24', 'ir_r1_k3_a1.1_p1.1_s1_e3_c24'], # relu + # stage 2, 56x56 in + ['ir_r1_k3.5.7_s2_e6_c40_se0.5_nsw', 'ir_r3_k3.5_a1.1_p1.1_s1_e6_c40_se0.5_nsw'], # swish + # stage 3, 28x28 in + ['ir_r1_k3.5.7_p1.1_s2_e6_c80_se0.25_nsw', 'ir_r2_k3.5_p1.1_s1_e6_c80_se0.25_nsw'], # swish + # stage 4, 14x14in + ['ir_r1_k3.5.7_a1.1_p1.1_s1_e6_c120_se0.5_nsw', 'ir_r2_k3.5.7.9_a1.1_p1.1_s1_e3_c120_se0.5_nsw'], # swish + # stage 5, 14x14in + ['ir_r1_k3.5.7.9.11_s2_e6_c200_se0.5_nsw', 'ir_r2_k3.5.7.9_p1.1_s1_e6_c200_se0.5_nsw'], # swish + # 7x7 + ] + model_kwargs = dict( + block_args=decode_arch_def(arch_def), + num_features=1536, + stem_size=16, + channel_multiplier=channel_multiplier, + norm_kwargs=resolve_bn_args(kwargs), + **kwargs + ) + model = _create_effnet(model_kwargs, variant, pretrained) + return model + + +def _gen_mixnet_m(variant, channel_multiplier=1.0, depth_multiplier=1.0, pretrained=False, **kwargs): + """Creates a MixNet Medium-Large model. + + Ref impl: https://github.com/tensorflow/tpu/tree/master/models/official/mnasnet/mixnet + Paper: https://arxiv.org/abs/1907.09595 + """ + arch_def = [ + # stage 0, 112x112 in + ['ds_r1_k3_s1_e1_c24'], # relu + # stage 1, 112x112 in + ['ir_r1_k3.5.7_a1.1_p1.1_s2_e6_c32', 'ir_r1_k3_a1.1_p1.1_s1_e3_c32'], # relu + # stage 2, 56x56 in + ['ir_r1_k3.5.7.9_s2_e6_c40_se0.5_nsw', 'ir_r3_k3.5_a1.1_p1.1_s1_e6_c40_se0.5_nsw'], # swish + # stage 3, 28x28 in + ['ir_r1_k3.5.7_s2_e6_c80_se0.25_nsw', 'ir_r3_k3.5.7.9_a1.1_p1.1_s1_e6_c80_se0.25_nsw'], # swish + # stage 4, 14x14in + ['ir_r1_k3_s1_e6_c120_se0.5_nsw', 'ir_r3_k3.5.7.9_a1.1_p1.1_s1_e3_c120_se0.5_nsw'], # swish + # stage 5, 14x14in + ['ir_r1_k3.5.7.9_s2_e6_c200_se0.5_nsw', 'ir_r3_k3.5.7.9_p1.1_s1_e6_c200_se0.5_nsw'], # swish + # 7x7 + ] + model_kwargs = dict( + block_args=decode_arch_def(arch_def, depth_multiplier, depth_trunc='round'), + num_features=1536, + stem_size=24, + channel_multiplier=channel_multiplier, + norm_kwargs=resolve_bn_args(kwargs), + **kwargs + ) + model = _create_effnet(model_kwargs, variant, pretrained) + return model + + +@register_model +def quant_mnasnet_050(pretrained=False, **kwargs): + """ MNASNet B1, depth multiplier of 0.5. """ + model = _gen_mnasnet_b1('mnasnet_050', 0.5, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_mnasnet_075(pretrained=False, **kwargs): + """ MNASNet B1, depth multiplier of 0.75. """ + model = _gen_mnasnet_b1('mnasnet_075', 0.75, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_mnasnet_100(pretrained=False, **kwargs): + """ MNASNet B1, depth multiplier of 1.0. """ + model = _gen_mnasnet_b1('mnasnet_100', 1.0, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_mnasnet_b1(pretrained=False, **kwargs): + """ MNASNet B1, depth multiplier of 1.0. 
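The quant_* entry points registered in this file are meant for the post-training static quantization flow driven by post_quantization_validation.py. A rough sketch of that flow, assuming the model classes insert quant/dequant stubs and that calib_loader is a calibration DataLoader:

    import torch
    from timm.models import create_model

    model = create_model('quant_mnasnet_100').eval()
    for m in list(model.modules()):          # fuse conv/bn/relu via each block's fuse_module()
        if hasattr(m, 'fuse_module'):
            m.fuse_module()
    model.qconfig = torch.quantization.get_default_qconfig('fbgemm')
    torch.quantization.prepare(model, inplace=True)
    with torch.no_grad():                    # calibrate observers on a few batches
        for images, _ in calib_loader:
            model(images)
    torch.quantization.convert(model, inplace=True)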
""" + return mnasnet_100(pretrained, **kwargs) + + +@register_model +def quant_mnasnet_140(pretrained=False, **kwargs): + """ MNASNet B1, depth multiplier of 1.4 """ + model = _gen_mnasnet_b1('mnasnet_140', 1.4, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_semnasnet_050(pretrained=False, **kwargs): + """ MNASNet A1 (w/ SE), depth multiplier of 0.5 """ + model = _gen_mnasnet_a1('semnasnet_050', 0.5, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_semnasnet_075(pretrained=False, **kwargs): + """ MNASNet A1 (w/ SE), depth multiplier of 0.75. """ + model = _gen_mnasnet_a1('semnasnet_075', 0.75, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_semnasnet_100(pretrained=False, **kwargs): + """ MNASNet A1 (w/ SE), depth multiplier of 1.0. """ + model = _gen_mnasnet_a1('semnasnet_100', 1.0, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_mnasnet_a1(pretrained=False, **kwargs): + """ MNASNet A1 (w/ SE), depth multiplier of 1.0. """ + return semnasnet_100(pretrained, **kwargs) + + +@register_model +def quant_semnasnet_140(pretrained=False, **kwargs): + """ MNASNet A1 (w/ SE), depth multiplier of 1.4. """ + model = _gen_mnasnet_a1('semnasnet_140', 1.4, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_mnasnet_small(pretrained=False, **kwargs): + """ MNASNet Small, depth multiplier of 1.0. """ + model = _gen_mnasnet_small('mnasnet_small', 1.0, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_mobilenetv2_100(pretrained=False, **kwargs): + """ MobileNet V2 w/ 1.0 channel multiplier """ + model = _gen_mobilenet_v2('mobilenetv2_100', 1.0, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_mobilenetv2_140(pretrained=False, **kwargs): + """ MobileNet V2 w/ 1.4 channel multiplier """ + model = _gen_mobilenet_v2('mobilenetv2_140', 1.4, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_mobilenetv2_110d(pretrained=False, **kwargs): + """ MobileNet V2 w/ 1.1 channel, 1.2 depth multipliers""" + model = _gen_mobilenet_v2( + 'mobilenetv2_110d', 1.1, depth_multiplier=1.2, fix_stem_head=True, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_mobilenetv2_120d(pretrained=False, **kwargs): + """ MobileNet V2 w/ 1.2 channel, 1.4 depth multipliers """ + model = _gen_mobilenet_v2( + 'mobilenetv2_120d', 1.2, depth_multiplier=1.4, fix_stem_head=True, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_fbnetc_100(pretrained=False, **kwargs): + """ FBNet-C """ + if pretrained: + # pretrained model trained with non-default BN epsilon + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + model = _gen_fbnetc('fbnetc_100', 1.0, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_spnasnet_100(pretrained=False, **kwargs): + """ Single-Path NAS Pixel1""" + model = _gen_spnasnet('spnasnet_100', 1.0, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_efficientnet_b0(pretrained=False, **kwargs): + """ EfficientNet-B0 """ + # NOTE for train, drop_rate should be 0.2, drop_path_rate should be 0.2 + model = _gen_efficientnet( + 'efficientnet_b0', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_efficientnet_b1(pretrained=False, **kwargs): + """ EfficientNet-B1 """ + # NOTE for train, drop_rate should be 0.2, 
drop_path_rate should be 0.2 + model = _gen_efficientnet( + 'efficientnet_b1', channel_multiplier=1.0, depth_multiplier=1.1, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_efficientnet_b2(pretrained=False, **kwargs): + """ EfficientNet-B2 """ + # NOTE for train, drop_rate should be 0.3, drop_path_rate should be 0.2 + model = _gen_efficientnet( + 'efficientnet_b2', channel_multiplier=1.1, depth_multiplier=1.2, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_efficientnet_b2a(pretrained=False, **kwargs): + """ EfficientNet-B2 @ 288x288 w/ 1.0 test crop""" + # NOTE for train, drop_rate should be 0.3, drop_path_rate should be 0.2 + model = _gen_efficientnet( + 'efficientnet_b2a', channel_multiplier=1.1, depth_multiplier=1.2, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_efficientnet_b3(pretrained=False, **kwargs): + """ EfficientNet-B3 """ + # NOTE for train, drop_rate should be 0.3, drop_path_rate should be 0.2 + model = _gen_efficientnet( + 'efficientnet_b3', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_efficientnet_b3a(pretrained=False, **kwargs): + """ EfficientNet-B3 @ 320x320 w/ 1.0 test crop-pct """ + # NOTE for train, drop_rate should be 0.3, drop_path_rate should be 0.2 + model = _gen_efficientnet( + 'efficientnet_b3a', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_efficientnet_b4(pretrained=False, **kwargs): + """ EfficientNet-B4 """ + # NOTE for train, drop_rate should be 0.4, drop_path_rate should be 0.2 + model = _gen_efficientnet( + 'efficientnet_b4', channel_multiplier=1.4, depth_multiplier=1.8, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_efficientnet_b5(pretrained=False, **kwargs): + """ EfficientNet-B5 """ + # NOTE for train, drop_rate should be 0.4, drop_path_rate should be 0.2 + model = _gen_efficientnet( + 'efficientnet_b5', channel_multiplier=1.6, depth_multiplier=2.2, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_efficientnet_b6(pretrained=False, **kwargs): + """ EfficientNet-B6 """ + # NOTE for train, drop_rate should be 0.5, drop_path_rate should be 0.2 + model = _gen_efficientnet( + 'efficientnet_b6', channel_multiplier=1.8, depth_multiplier=2.6, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_efficientnet_b7(pretrained=False, **kwargs): + """ EfficientNet-B7 """ + # NOTE for train, drop_rate should be 0.5, drop_path_rate should be 0.2 + model = _gen_efficientnet( + 'efficientnet_b7', channel_multiplier=2.0, depth_multiplier=3.1, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_efficientnet_b8(pretrained=False, **kwargs): + """ EfficientNet-B8 """ + # NOTE for train, drop_rate should be 0.5, drop_path_rate should be 0.2 + model = _gen_efficientnet( + 'efficientnet_b8', channel_multiplier=2.2, depth_multiplier=3.6, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_efficientnet_l2(pretrained=False, **kwargs): + """ EfficientNet-L2.""" + # NOTE for train, drop_rate should be 0.5, drop_path_rate should be 0.2 + model = _gen_efficientnet( + 'efficientnet_l2', channel_multiplier=4.3, depth_multiplier=5.3, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_efficientnet_es(pretrained=False, **kwargs): + """ EfficientNet-Edge Small. 
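The tf_-prefixed variants further down override bn_eps and pad_type to match the TensorFlow reference weights; those kwargs reach the norm layers through resolve_bn_args in efficientnet_blocks.py. Roughly:

    kwargs = dict(bn_eps=BN_EPS_TF_DEFAULT)   # 1e-3, vs the PyTorch BatchNorm default of 1e-5
    resolve_bn_args(kwargs)                   # -> {'eps': 0.001}, later passed on as norm_kwargs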
""" + model = _gen_efficientnet_edge( + 'efficientnet_es', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_efficientnet_em(pretrained=False, **kwargs): + """ EfficientNet-Edge-Medium. """ + model = _gen_efficientnet_edge( + 'efficientnet_em', channel_multiplier=1.0, depth_multiplier=1.1, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_efficientnet_el(pretrained=False, **kwargs): + """ EfficientNet-Edge-Large. """ + model = _gen_efficientnet_edge( + 'efficientnet_el', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_efficientnet_cc_b0_4e(pretrained=False, **kwargs): + """ EfficientNet-CondConv-B0 w/ 8 Experts """ + # NOTE for train, drop_rate should be 0.2, drop_path_rate should be 0.2 + model = _gen_efficientnet_condconv( + 'efficientnet_cc_b0_4e', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_efficientnet_cc_b0_8e(pretrained=False, **kwargs): + """ EfficientNet-CondConv-B0 w/ 8 Experts """ + # NOTE for train, drop_rate should be 0.2, drop_path_rate should be 0.2 + model = _gen_efficientnet_condconv( + 'efficientnet_cc_b0_8e', channel_multiplier=1.0, depth_multiplier=1.0, experts_multiplier=2, + pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_efficientnet_cc_b1_8e(pretrained=False, **kwargs): + """ EfficientNet-CondConv-B1 w/ 8 Experts """ + # NOTE for train, drop_rate should be 0.2, drop_path_rate should be 0.2 + model = _gen_efficientnet_condconv( + 'efficientnet_cc_b1_8e', channel_multiplier=1.0, depth_multiplier=1.1, experts_multiplier=2, + pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_efficientnet_lite0(pretrained=False, **kwargs): + """ EfficientNet-Lite0 """ + # NOTE for train, drop_rate should be 0.2, drop_path_rate should be 0.2 + model = _gen_efficientnet_lite( + 'efficientnet_lite0', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_efficientnet_lite1(pretrained=False, **kwargs): + """ EfficientNet-Lite1 """ + # NOTE for train, drop_rate should be 0.2, drop_path_rate should be 0.2 + model = _gen_efficientnet_lite( + 'efficientnet_lite1', channel_multiplier=1.0, depth_multiplier=1.1, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_efficientnet_lite2(pretrained=False, **kwargs): + """ EfficientNet-Lite2 """ + # NOTE for train, drop_rate should be 0.3, drop_path_rate should be 0.2 + model = _gen_efficientnet_lite( + 'efficientnet_lite2', channel_multiplier=1.1, depth_multiplier=1.2, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_efficientnet_lite3(pretrained=False, **kwargs): + """ EfficientNet-Lite3 """ + # NOTE for train, drop_rate should be 0.3, drop_path_rate should be 0.2 + model = _gen_efficientnet_lite( + 'efficientnet_lite3', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_efficientnet_lite4(pretrained=False, **kwargs): + """ EfficientNet-Lite4 """ + # NOTE for train, drop_rate should be 0.4, drop_path_rate should be 0.2 + model = _gen_efficientnet_lite( + 'efficientnet_lite4', channel_multiplier=1.4, depth_multiplier=1.8, pretrained=pretrained, **kwargs) + return model + + +@register_model +def 
quant_efficientnet_b1_pruned(pretrained=False, **kwargs): + """ EfficientNet-B1 Pruned. The pruning has been obtained using https://arxiv.org/pdf/2002.08258.pdf """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + variant = 'efficientnet_b1_pruned' + model = _gen_efficientnet( + variant, channel_multiplier=1.0, depth_multiplier=1.1, pruned=True, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_efficientnet_b2_pruned(pretrained=False, **kwargs): + """ EfficientNet-B2 Pruned. The pruning has been obtained using https://arxiv.org/pdf/2002.08258.pdf """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet( + 'efficientnet_b2_pruned', channel_multiplier=1.1, depth_multiplier=1.2, pruned=True, + pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_efficientnet_b3_pruned(pretrained=False, **kwargs): + """ EfficientNet-B3 Pruned. The pruning has been obtained using https://arxiv.org/pdf/2002.08258.pdf """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet( + 'efficientnet_b3_pruned', channel_multiplier=1.2, depth_multiplier=1.4, pruned=True, + pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_tf_efficientnet_b0(pretrained=False, **kwargs): + """ EfficientNet-B0. Tensorflow compatible variant """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet( + 'tf_efficientnet_b0', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_tf_efficientnet_b1(pretrained=False, **kwargs): + """ EfficientNet-B1. Tensorflow compatible variant """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet( + 'tf_efficientnet_b1', channel_multiplier=1.0, depth_multiplier=1.1, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_tf_efficientnet_b2(pretrained=False, **kwargs): + """ EfficientNet-B2. Tensorflow compatible variant """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet( + 'tf_efficientnet_b2', channel_multiplier=1.1, depth_multiplier=1.2, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_tf_efficientnet_b3(pretrained=False, **kwargs): + """ EfficientNet-B3. Tensorflow compatible variant """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet( + 'tf_efficientnet_b3', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_tf_efficientnet_b4(pretrained=False, **kwargs): + """ EfficientNet-B4. Tensorflow compatible variant """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet( + 'tf_efficientnet_b4', channel_multiplier=1.4, depth_multiplier=1.8, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_tf_efficientnet_b5(pretrained=False, **kwargs): + """ EfficientNet-B5. Tensorflow compatible variant """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet( + 'tf_efficientnet_b5', channel_multiplier=1.6, depth_multiplier=2.2, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_tf_efficientnet_b6(pretrained=False, **kwargs): + """ EfficientNet-B6. 
Tensorflow compatible variant """ + # NOTE for train, drop_rate should be 0.5 + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet( + 'tf_efficientnet_b6', channel_multiplier=1.8, depth_multiplier=2.6, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_tf_efficientnet_b7(pretrained=False, **kwargs): + """ EfficientNet-B7. Tensorflow compatible variant """ + # NOTE for train, drop_rate should be 0.5 + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet( + 'tf_efficientnet_b7', channel_multiplier=2.0, depth_multiplier=3.1, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_tf_efficientnet_b8(pretrained=False, **kwargs): + """ EfficientNet-B8. Tensorflow compatible variant """ + # NOTE for train, drop_rate should be 0.5 + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet( + 'tf_efficientnet_b8', channel_multiplier=2.2, depth_multiplier=3.6, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_tf_efficientnet_b0_ap(pretrained=False, **kwargs): + """ EfficientNet-B0 AdvProp. Tensorflow compatible variant """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet( + 'tf_efficientnet_b0_ap', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_tf_efficientnet_b1_ap(pretrained=False, **kwargs): + """ EfficientNet-B1 AdvProp. Tensorflow compatible variant """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet( + 'tf_efficientnet_b1_ap', channel_multiplier=1.0, depth_multiplier=1.1, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_tf_efficientnet_b2_ap(pretrained=False, **kwargs): + """ EfficientNet-B2 AdvProp. Tensorflow compatible variant """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet( + 'tf_efficientnet_b2_ap', channel_multiplier=1.1, depth_multiplier=1.2, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_tf_efficientnet_b3_ap(pretrained=False, **kwargs): + """ EfficientNet-B3 AdvProp. Tensorflow compatible variant """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet( + 'tf_efficientnet_b3_ap', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_tf_efficientnet_b4_ap(pretrained=False, **kwargs): + """ EfficientNet-B4 AdvProp. Tensorflow compatible variant """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet( + 'tf_efficientnet_b4_ap', channel_multiplier=1.4, depth_multiplier=1.8, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_tf_efficientnet_b5_ap(pretrained=False, **kwargs): + """ EfficientNet-B5 AdvProp. Tensorflow compatible variant """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet( + 'tf_efficientnet_b5_ap', channel_multiplier=1.6, depth_multiplier=2.2, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_tf_efficientnet_b6_ap(pretrained=False, **kwargs): + """ EfficientNet-B6 AdvProp. 
Tensorflow compatible variant """ + # NOTE for train, drop_rate should be 0.5 + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet( + 'tf_efficientnet_b6_ap', channel_multiplier=1.8, depth_multiplier=2.6, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_tf_efficientnet_b7_ap(pretrained=False, **kwargs): + """ EfficientNet-B7 AdvProp. Tensorflow compatible variant """ + # NOTE for train, drop_rate should be 0.5 + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet( + 'tf_efficientnet_b7_ap', channel_multiplier=2.0, depth_multiplier=3.1, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_tf_efficientnet_b8_ap(pretrained=False, **kwargs): + """ EfficientNet-B8 AdvProp. Tensorflow compatible variant """ + # NOTE for train, drop_rate should be 0.5 + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet( + 'tf_efficientnet_b8_ap', channel_multiplier=2.2, depth_multiplier=3.6, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_tf_efficientnet_b0_ns(pretrained=False, **kwargs): + """ EfficientNet-B0 NoisyStudent. Tensorflow compatible variant """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet( + 'tf_efficientnet_b0_ns', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_tf_efficientnet_b1_ns(pretrained=False, **kwargs): + """ EfficientNet-B1 NoisyStudent. Tensorflow compatible variant """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet( + 'tf_efficientnet_b1_ns', channel_multiplier=1.0, depth_multiplier=1.1, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_tf_efficientnet_b2_ns(pretrained=False, **kwargs): + """ EfficientNet-B2 NoisyStudent. Tensorflow compatible variant """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet( + 'tf_efficientnet_b2_ns', channel_multiplier=1.1, depth_multiplier=1.2, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_tf_efficientnet_b3_ns(pretrained=False, **kwargs): + """ EfficientNet-B3 NoisyStudent. Tensorflow compatible variant """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet( + 'tf_efficientnet_b3_ns', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_tf_efficientnet_b4_ns(pretrained=False, **kwargs): + """ EfficientNet-B4 NoisyStudent. Tensorflow compatible variant """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet( + 'tf_efficientnet_b4_ns', channel_multiplier=1.4, depth_multiplier=1.8, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_tf_efficientnet_b5_ns(pretrained=False, **kwargs): + """ EfficientNet-B5 NoisyStudent. Tensorflow compatible variant """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet( + 'tf_efficientnet_b5_ns', channel_multiplier=1.6, depth_multiplier=2.2, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_tf_efficientnet_b6_ns(pretrained=False, **kwargs): + """ EfficientNet-B6 NoisyStudent. 
Tensorflow compatible variant """ + # NOTE for train, drop_rate should be 0.5 + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet( + 'tf_efficientnet_b6_ns', channel_multiplier=1.8, depth_multiplier=2.6, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_tf_efficientnet_b7_ns(pretrained=False, **kwargs): + """ EfficientNet-B7 NoisyStudent. Tensorflow compatible variant """ + # NOTE for train, drop_rate should be 0.5 + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet( + 'tf_efficientnet_b7_ns', channel_multiplier=2.0, depth_multiplier=3.1, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_tf_efficientnet_l2_ns_475(pretrained=False, **kwargs): + """ EfficientNet-L2 NoisyStudent @ 475x475. Tensorflow compatible variant """ + # NOTE for train, drop_rate should be 0.5 + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet( + 'tf_efficientnet_l2_ns_475', channel_multiplier=4.3, depth_multiplier=5.3, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_tf_efficientnet_l2_ns(pretrained=False, **kwargs): + """ EfficientNet-L2 NoisyStudent. Tensorflow compatible variant """ + # NOTE for train, drop_rate should be 0.5 + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet( + 'tf_efficientnet_l2_ns', channel_multiplier=4.3, depth_multiplier=5.3, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_tf_efficientnet_es(pretrained=False, **kwargs): + """ EfficientNet-Edge Small. Tensorflow compatible variant """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet_edge( + 'tf_efficientnet_es', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_tf_efficientnet_em(pretrained=False, **kwargs): + """ EfficientNet-Edge-Medium. Tensorflow compatible variant """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet_edge( + 'tf_efficientnet_em', channel_multiplier=1.0, depth_multiplier=1.1, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_tf_efficientnet_el(pretrained=False, **kwargs): + """ EfficientNet-Edge-Large. Tensorflow compatible variant """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet_edge( + 'tf_efficientnet_el', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_tf_efficientnet_cc_b0_4e(pretrained=False, **kwargs): + """ EfficientNet-CondConv-B0 w/ 4 Experts. Tensorflow compatible variant """ + # NOTE for train, drop_rate should be 0.2, drop_path_rate should be 0.2 + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet_condconv( + 'tf_efficientnet_cc_b0_4e', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_tf_efficientnet_cc_b0_8e(pretrained=False, **kwargs): + """ EfficientNet-CondConv-B0 w/ 8 Experts. 
Tensorflow compatible variant """ + # NOTE for train, drop_rate should be 0.2, drop_path_rate should be 0.2 + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet_condconv( + 'tf_efficientnet_cc_b0_8e', channel_multiplier=1.0, depth_multiplier=1.0, experts_multiplier=2, + pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_tf_efficientnet_cc_b1_8e(pretrained=False, **kwargs): + """ EfficientNet-CondConv-B1 w/ 8 Experts. Tensorflow compatible variant """ + # NOTE for train, drop_rate should be 0.2, drop_path_rate should be 0.2 + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet_condconv( + 'tf_efficientnet_cc_b1_8e', channel_multiplier=1.0, depth_multiplier=1.1, experts_multiplier=2, + pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_tf_efficientnet_lite0(pretrained=False, **kwargs): + """ EfficientNet-Lite0 """ + # NOTE for train, drop_rate should be 0.2, drop_path_rate should be 0.2 + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet_lite( + 'tf_efficientnet_lite0', channel_multiplier=1.0, depth_multiplier=1.0, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_tf_efficientnet_lite1(pretrained=False, **kwargs): + """ EfficientNet-Lite1 """ + # NOTE for train, drop_rate should be 0.2, drop_path_rate should be 0.2 + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet_lite( + 'tf_efficientnet_lite1', channel_multiplier=1.0, depth_multiplier=1.1, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_tf_efficientnet_lite2(pretrained=False, **kwargs): + """ EfficientNet-Lite2 """ + # NOTE for train, drop_rate should be 0.3, drop_path_rate should be 0.2 + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet_lite( + 'tf_efficientnet_lite2', channel_multiplier=1.1, depth_multiplier=1.2, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_tf_efficientnet_lite3(pretrained=False, **kwargs): + """ EfficientNet-Lite3 """ + # NOTE for train, drop_rate should be 0.3, drop_path_rate should be 0.2 + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet_lite( + 'tf_efficientnet_lite3', channel_multiplier=1.2, depth_multiplier=1.4, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_tf_efficientnet_lite4(pretrained=False, **kwargs): + """ EfficientNet-Lite4 """ + # NOTE for train, drop_rate should be 0.4, drop_path_rate should be 0.2 + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_efficientnet_lite( + 'tf_efficientnet_lite4', channel_multiplier=1.4, depth_multiplier=1.8, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_mixnet_s(pretrained=False, **kwargs): + """Creates a MixNet Small model. + """ + model = _gen_mixnet_s( + 'mixnet_s', channel_multiplier=1.0, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_mixnet_m(pretrained=False, **kwargs): + """Creates a MixNet Medium model. + """ + model = _gen_mixnet_m( + 'mixnet_m', channel_multiplier=1.0, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_mixnet_l(pretrained=False, **kwargs): + """Creates a MixNet Large model. 
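The mixnet_xl/xxl variants below scale depth with depth_trunc='round' (see _scale_stage_depth in efficientnet_builder.py), which keeps single-repeat stages small for longer than the default 'ceil'. For example:

    decode_arch_def([['ir_r1_k3_s1_e6_c32'], ['ir_r3_k3_s2_e6_c64']],
                    depth_multiplier=1.2, depth_trunc='round')
    # stage 0 keeps 1 block (round(1.2) == 1) and stage 1 grows to 4 (round(3.6) == 4);
    # the default 'ceil' truncation would give 2 and 4 blocks instead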
+ """ + model = _gen_mixnet_m( + 'mixnet_l', channel_multiplier=1.3, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_mixnet_xl(pretrained=False, **kwargs): + """Creates a MixNet Extra-Large model. + Not a paper spec, experimental def by RW w/ depth scaling. + """ + model = _gen_mixnet_m( + 'mixnet_xl', channel_multiplier=1.6, depth_multiplier=1.2, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_mixnet_xxl(pretrained=False, **kwargs): + """Creates a MixNet Double Extra Large model. + Not a paper spec, experimental def by RW w/ depth scaling. + """ + model = _gen_mixnet_m( + 'mixnet_xxl', channel_multiplier=2.4, depth_multiplier=1.3, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_tf_mixnet_s(pretrained=False, **kwargs): + """Creates a MixNet Small model. Tensorflow compatible variant + """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_mixnet_s( + 'tf_mixnet_s', channel_multiplier=1.0, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_tf_mixnet_m(pretrained=False, **kwargs): + """Creates a MixNet Medium model. Tensorflow compatible variant + """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_mixnet_m( + 'tf_mixnet_m', channel_multiplier=1.0, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_tf_mixnet_l(pretrained=False, **kwargs): + """Creates a MixNet Large model. Tensorflow compatible variant + """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_mixnet_m( + 'tf_mixnet_l', channel_multiplier=1.3, pretrained=pretrained, **kwargs) + return model diff --git a/timm/models/quantization/efficientnet_blocks.py b/timm/models/quantization/efficientnet_blocks.py new file mode 100644 index 00000000..119c72bf --- /dev/null +++ b/timm/models/quantization/efficientnet_blocks.py @@ -0,0 +1,423 @@ +""" EfficientNet, MobileNetV3, etc Blocks + +Hacked together by / Copyright 2020 Ross Wightman +""" + +import torch +import torch.nn as nn +from torch.nn import functional as F + +from .layers import create_conv2d, drop_path, get_act_layer +from .layers.activations import sigmoid, HardSigmoid + +# Defaults used for Google/Tensorflow training of mobile networks /w RMSprop as per +# papers and TF reference implementations. 
PT momentum equiv for TF decay is (1 - TF decay) +# NOTE: momentum varies btw .99 and .9997 depending on source +# .99 in official TF TPU impl +# .9997 (/w .999 in search space) for paper +BN_MOMENTUM_TF_DEFAULT = 1 - 0.99 +BN_EPS_TF_DEFAULT = 1e-3 +_BN_ARGS_TF = dict(momentum=BN_MOMENTUM_TF_DEFAULT, eps=BN_EPS_TF_DEFAULT) + + +def get_bn_args_tf(): + return _BN_ARGS_TF.copy() + + +def resolve_bn_args(kwargs): + bn_args = get_bn_args_tf() if kwargs.pop('bn_tf', False) else {} + bn_momentum = kwargs.pop('bn_momentum', None) + if bn_momentum is not None: + bn_args['momentum'] = bn_momentum + bn_eps = kwargs.pop('bn_eps', None) + if bn_eps is not None: + bn_args['eps'] = bn_eps + return bn_args + + +_SE_ARGS_DEFAULT = dict( + gate_fn=sigmoid, + act_layer=None, + reduce_mid=False, + divisor=1) + + +def resolve_se_args(kwargs, in_chs, act_layer=None): + se_kwargs = kwargs.copy() if kwargs is not None else {} + # fill in args that aren't specified with the defaults + for k, v in _SE_ARGS_DEFAULT.items(): + se_kwargs.setdefault(k, v) + # some models, like MobilNetV3, calculate SE reduction chs from the containing block's mid_ch instead of in_ch + if not se_kwargs.pop('reduce_mid'): + se_kwargs['reduced_base_chs'] = in_chs + # act_layer override, if it remains None, the containing block's act_layer will be used + if se_kwargs['act_layer'] is None: + assert act_layer is not None + se_kwargs['act_layer'] = act_layer + return se_kwargs + + +def resolve_act_layer(kwargs, default='relu'): + act_layer = kwargs.pop('act_layer', default) + if isinstance(act_layer, str): + act_layer = get_act_layer(act_layer) + return act_layer + + +def make_divisible(v, divisor=8, min_value=None): + min_value = min_value or divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + # Make sure that round down does not go down by more than 10%. 
+ if new_v < 0.9 * v: + new_v += divisor + return new_v + + +def round_channels(channels, multiplier=1.0, divisor=8, channel_min=None): + """Round number of filters based on depth multiplier.""" + if not multiplier: + return channels + channels *= multiplier + return make_divisible(channels, divisor, channel_min) + + +class ChannelShuffle(nn.Module): + # FIXME haven't used yet + def __init__(self, groups): + super(ChannelShuffle, self).__init__() + self.groups = groups + + def forward(self, x): + """Channel shuffle: [N,C,H,W] -> [N,g,C/g,H,W] -> [N,C/g,g,H,w] -> [N,C,H,W]""" + N, C, H, W = x.size() + g = self.groups + assert C % g == 0, "Incompatible group size {} for input channel {}".format( + g, C + ) + return ( + x.view(N, g, int(C / g), H, W) + .permute(0, 2, 1, 3, 4) + .contiguous() + .view(N, C, H, W) + ) + + +class SqueezeExcite(nn.Module): + def __init__(self, in_chs, se_ratio=0.25, reduced_base_chs=None, + act_layer=nn.ReLU, gate_fn=sigmoid, divisor=1, **_): + super(SqueezeExcite, self).__init__() + self.gate_fn = gate_fn + reduced_chs = make_divisible((reduced_base_chs or in_chs) * se_ratio, divisor) + self.avg_pool = nn.AdaptiveAvgPool2d(1) + self.conv_reduce = nn.Conv2d(in_chs, reduced_chs, 1, bias=True) + self.act1 = act_layer(inplace=True) + self.conv_expand = nn.Conv2d(reduced_chs, in_chs, 1, bias=True) + self.quant_mul = nn.quantized.FloatFunctional() + if gate_fn == HardSigmoid: + self.gate_fn = HardSigmoid() + def forward(self, x): + x_se = self.avg_pool(x) + x_se = self.conv_reduce(x_se) + x_se = self.act1(x_se) + x_se = self.conv_expand(x_se) + x = self.quant_mul.mul(x, self.gate_fn(x_se)) + return x + def fuse_module(self): + if type(self.act1) == nn.ReLU: + modules_to_fuse = ['conv_reduce','act1'] + torch.quantization.fuse_modules(self, modules_to_fuse, inplace=True) + +class ConvBnAct(nn.Module): + def __init__(self, in_chs, out_chs, kernel_size, + stride=1, dilation=1, pad_type='', act_layer=nn.ReLU, + norm_layer=nn.BatchNorm2d, norm_kwargs=None): + super(ConvBnAct, self).__init__() + norm_kwargs = norm_kwargs or {} + self.conv = create_conv2d(in_chs, out_chs, kernel_size, stride=stride, dilation=dilation, padding=pad_type) + self.bn1 = norm_layer(out_chs, **norm_kwargs) + self.act1 = act_layer(inplace=True) + + def feature_info(self, location): + if location == 'expansion': # output of conv after act, same as block coutput + info = dict(module='act1', hook_type='forward', num_chs=self.conv.out_channels) + else: # location == 'bottleneck', block output + info = dict(module='', hook_type='', num_chs=self.conv.out_channels) + return info + + def forward(self, x): + x = self.conv(x) + x = self.bn1(x) + x = self.act1(x) + return x + def fuse_module(self): + modules_to_fuse = ['conv','bn1'] + if type(self.act1) == nn.ReLU: + modules_to_fuse.append('act1') + torch.quantization.fuse_modules(self, modules_to_fuse, inplace=True) + +class DepthwiseSeparableConv(nn.Module): + """ DepthwiseSeparable block + Used for DS convs in MobileNet-V1 and in the place of IR blocks that have no expansion + (factor of 1.0). This is an alternative to having a IR with an optional first pw conv. + """ + def __init__(self, in_chs, out_chs, dw_kernel_size=3, + stride=1, dilation=1, pad_type='', act_layer=nn.ReLU, noskip=False, + pw_kernel_size=1, pw_act=False, se_ratio=0., se_kwargs=None, + norm_layer=nn.BatchNorm2d, norm_kwargs=None, drop_path_rate=0.): + super(DepthwiseSeparableConv, self).__init__() + norm_kwargs = norm_kwargs or {} + has_se = se_ratio is not None and se_ratio > 0. 
+ self.has_residual = (stride == 1 and in_chs == out_chs) and not noskip + self.has_pw_act = pw_act # activation after point-wise conv + self.drop_path_rate = drop_path_rate + + self.conv_dw = create_conv2d( + in_chs, in_chs, dw_kernel_size, stride=stride, dilation=dilation, padding=pad_type, depthwise=True) + self.bn1 = norm_layer(in_chs, **norm_kwargs) + self.act1 = act_layer(inplace=True) + + # Squeeze-and-excitation + if has_se: + se_kwargs = resolve_se_args(se_kwargs, in_chs, act_layer) + self.se = SqueezeExcite(in_chs, se_ratio=se_ratio, **se_kwargs) + else: + self.se = None + + self.conv_pw = create_conv2d(in_chs, out_chs, pw_kernel_size, padding=pad_type) + self.bn2 = norm_layer(out_chs, **norm_kwargs) + self.act2 = act_layer(inplace=True) if self.has_pw_act else nn.Identity() + self.skip_add = nn.quantized.FloatFunctional() + + def feature_info(self, location): + if location == 'expansion': # after SE, input to PW + info = dict(module='conv_pw', hook_type='forward_pre', num_chs=self.conv_pw.in_channels) + else: # location == 'bottleneck', block output + info = dict(module='', hook_type='', num_chs=self.conv_pw.out_channels) + return info + + def forward(self, x): + residual = x + + x = self.conv_dw(x) + x = self.bn1(x) + x = self.act1(x) + + if self.se is not None: + x = self.se(x) + + x = self.conv_pw(x) + x = self.bn2(x) + x = self.act2(x) + + if self.has_residual: + if self.drop_path_rate > 0.: + x = drop_path(x, self.drop_path_rate, self.training) + x = self.skip_add.add(x, residual) + return x + def fuse_module(self): + modules_to_fuse = [['conv_dw','bn1'],['conv_pw','bn2']] + if type(self.act1) == nn.ReLU: + modules_to_fuse[0].append('act1') + if type(self.act2) == nn.ReLU: + modules_to_fuse[1].append('act2') + torch.quantization.fuse_modules(self, modules_to_fuse, inplace=True) + +class InvertedResidual(nn.Module): + """ Inverted residual block w/ optional SE and CondConv routing""" + + def __init__(self, in_chs, out_chs, dw_kernel_size=3, + stride=1, dilation=1, pad_type='', act_layer=nn.ReLU, noskip=False, + exp_ratio=1.0, exp_kernel_size=1, pw_kernel_size=1, + se_ratio=0., se_kwargs=None, norm_layer=nn.BatchNorm2d, norm_kwargs=None, + conv_kwargs=None, drop_path_rate=0.): + super(InvertedResidual, self).__init__() + norm_kwargs = norm_kwargs or {} + conv_kwargs = conv_kwargs or {} + mid_chs = make_divisible(in_chs * exp_ratio) + has_se = se_ratio is not None and se_ratio > 0. 
+ self.has_residual = (in_chs == out_chs and stride == 1) and not noskip + self.drop_path_rate = drop_path_rate + + # Point-wise expansion + self.conv_pw = create_conv2d(in_chs, mid_chs, exp_kernel_size, padding=pad_type, **conv_kwargs) + self.bn1 = norm_layer(mid_chs, **norm_kwargs) + self.act1 = act_layer(inplace=True) + + # Depth-wise convolution + self.conv_dw = create_conv2d( + mid_chs, mid_chs, dw_kernel_size, stride=stride, dilation=dilation, + padding=pad_type, depthwise=True, **conv_kwargs) + self.bn2 = norm_layer(mid_chs, **norm_kwargs) + self.act2 = act_layer(inplace=True) + + # Squeeze-and-excitation + if has_se: + se_kwargs = resolve_se_args(se_kwargs, in_chs, act_layer) + self.se = SqueezeExcite(mid_chs, se_ratio=se_ratio, **se_kwargs) + else: + self.se = None + + # Point-wise linear projection + self.conv_pwl = create_conv2d(mid_chs, out_chs, pw_kernel_size, padding=pad_type, **conv_kwargs) + self.bn3 = norm_layer(out_chs, **norm_kwargs) + + self.skip_add = nn.quantized.FloatFunctional() + def feature_info(self, location): + if location == 'expansion': # after SE, input to PWL + info = dict(module='conv_pwl', hook_type='forward_pre', num_chs=self.conv_pwl.in_channels) + else: # location == 'bottleneck', block output + info = dict(module='', hook_type='', num_chs=self.conv_pwl.out_channels) + return info + + def forward(self, x): + residual = x + + # Point-wise expansion + x = self.conv_pw(x) + x = self.bn1(x) + x = self.act1(x) + + # Depth-wise convolution + x = self.conv_dw(x) + x = self.bn2(x) + x = self.act2(x) + + # Squeeze-and-excitation + if self.se is not None: + x = self.se(x) + + # Point-wise linear projection + x = self.conv_pwl(x) + x = self.bn3(x) + + if self.has_residual: + if self.drop_path_rate > 0.: + x = drop_path(x, self.drop_path_rate, self.training) + x = self.skip_add.add(x, residual) + + return x + def fuse_module(self): + modules_to_fuse = [['conv_pw','bn1'],['conv_dw','bn2'],['conv_pwl','bn3']] + if type(self.act1) == nn.ReLU: + modules_to_fuse[0].append('act1') + if type(self.act2) == nn.ReLU: + modules_to_fuse[1].append('act2') + + torch.quantization.fuse_modules(self, modules_to_fuse, inplace=True) + +class CondConvResidual(InvertedResidual): + """ Inverted residual block w/ CondConv routing""" + + def __init__(self, in_chs, out_chs, dw_kernel_size=3, + stride=1, dilation=1, pad_type='', act_layer=nn.ReLU, noskip=False, + exp_ratio=1.0, exp_kernel_size=1, pw_kernel_size=1, + se_ratio=0., se_kwargs=None, norm_layer=nn.BatchNorm2d, norm_kwargs=None, + num_experts=0, drop_path_rate=0.): + + self.num_experts = num_experts + conv_kwargs = dict(num_experts=self.num_experts) + + super(CondConvResidual, self).__init__( + in_chs, out_chs, dw_kernel_size=dw_kernel_size, stride=stride, dilation=dilation, pad_type=pad_type, + act_layer=act_layer, noskip=noskip, exp_ratio=exp_ratio, exp_kernel_size=exp_kernel_size, + pw_kernel_size=pw_kernel_size, se_ratio=se_ratio, se_kwargs=se_kwargs, + norm_layer=norm_layer, norm_kwargs=norm_kwargs, conv_kwargs=conv_kwargs, + drop_path_rate=drop_path_rate) + + self.routing_fn = nn.Linear(in_chs, self.num_experts) + + def forward(self, x): + residual = x + + # CondConv routing + pooled_inputs = F.adaptive_avg_pool2d(x, 1).flatten(1) + routing_weights = torch.sigmoid(self.routing_fn(pooled_inputs)) + + # Point-wise expansion + x = self.conv_pw(x, routing_weights) + x = self.bn1(x) + x = self.act1(x) + + # Depth-wise convolution + x = self.conv_dw(x, routing_weights) + x = self.bn2(x) + x = self.act2(x) + + # 
Squeeze-and-excitation + if self.se is not None: + x = self.se(x) + + # Point-wise linear projection + x = self.conv_pwl(x, routing_weights) + x = self.bn3(x) + + if self.has_residual: + if self.drop_path_rate > 0.: + x = drop_path(x, self.drop_path_rate, self.training) + x += residual + return x + + +class EdgeResidual(nn.Module): + """ Residual block with expansion convolution followed by pointwise-linear w/ stride""" + + def __init__(self, in_chs, out_chs, exp_kernel_size=3, exp_ratio=1.0, fake_in_chs=0, + stride=1, dilation=1, pad_type='', act_layer=nn.ReLU, noskip=False, pw_kernel_size=1, + se_ratio=0., se_kwargs=None, norm_layer=nn.BatchNorm2d, norm_kwargs=None, + drop_path_rate=0.): + super(EdgeResidual, self).__init__() + norm_kwargs = norm_kwargs or {} + if fake_in_chs > 0: + mid_chs = make_divisible(fake_in_chs * exp_ratio) + else: + mid_chs = make_divisible(in_chs * exp_ratio) + has_se = se_ratio is not None and se_ratio > 0. + self.has_residual = (in_chs == out_chs and stride == 1) and not noskip + self.drop_path_rate = drop_path_rate + + # Expansion convolution + self.conv_exp = create_conv2d(in_chs, mid_chs, exp_kernel_size, padding=pad_type) + self.bn1 = norm_layer(mid_chs, **norm_kwargs) + self.act1 = act_layer(inplace=True) + + # Squeeze-and-excitation + if has_se: + se_kwargs = resolve_se_args(se_kwargs, in_chs, act_layer) + self.se = SqueezeExcite(mid_chs, se_ratio=se_ratio, **se_kwargs) + else: + self.se = None + + # Point-wise linear projection + self.conv_pwl = create_conv2d( + mid_chs, out_chs, pw_kernel_size, stride=stride, dilation=dilation, padding=pad_type) + self.bn2 = norm_layer(out_chs, **norm_kwargs) + + def feature_info(self, location): + if location == 'expansion': # after SE, before PWL + info = dict(module='conv_pwl', hook_type='forward_pre', num_chs=self.conv_pwl.in_channels) + else: # location == 'bottleneck', block output + info = dict(module='', hook_type='', num_chs=self.conv_pwl.out_channels) + return info + + def forward(self, x): + residual = x + + # Expansion convolution + x = self.conv_exp(x) + x = self.bn1(x) + x = self.act1(x) + + # Squeeze-and-excitation + if self.se is not None: + x = self.se(x) + + # Point-wise linear projection + x = self.conv_pwl(x) + x = self.bn2(x) + + if self.has_residual: + if self.drop_path_rate > 0.: + x = drop_path(x, self.drop_path_rate, self.training) + x += residual + + return x diff --git a/timm/models/quantization/efficientnet_builder.py b/timm/models/quantization/efficientnet_builder.py new file mode 100644 index 00000000..f670aa6c --- /dev/null +++ b/timm/models/quantization/efficientnet_builder.py @@ -0,0 +1,414 @@ +""" EfficientNet, MobileNetV3, etc Builder + +Assembles EfficieNet and related network feature blocks from string definitions. +Handles stride, dilation calculations, and selects feature extraction points. 
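A minimal usage sketch, assuming the defaults shown (the builder returns one nn.Sequential stack per stage, as described on EfficientNetBuilder below):

    arch_def = [['ds_r1_k3_s1_c16'], ['ir_r2_k3_s2_e6_c24']]
    builder = EfficientNetBuilder(channel_multiplier=1.0, act_layer=nn.ReLU)
    stages = builder(in_chs=32, model_block_args=decode_arch_def(arch_def))
    trunk = nn.Sequential(*stages)   # stem and head are added by the model classes, not here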
+ +Hacked together by / Copyright 2020 Ross Wightman +""" + +import logging +import math +import re +from copy import deepcopy + +import torch.nn as nn + +from .efficientnet_blocks import * +from .layers import CondConv2d, get_condconv_initializer + +__all__ = ["EfficientNetBuilder", "decode_arch_def", "efficientnet_init_weights"] + +_logger = logging.getLogger(__name__) + + +def _log_info_if(msg, condition): + if condition: + _logger.info(msg) + + +def _parse_ksize(ss): + if ss.isdigit(): + return int(ss) + else: + return [int(k) for k in ss.split('.')] + + +def _decode_block_str(block_str): + """ Decode block definition string + + Gets a list of block arg (dicts) through a string notation of arguments. + E.g. ir_r2_k3_s2_e1_i32_o16_se0.25_noskip + + All args can exist in any order with the exception of the leading string which + is assumed to indicate the block type. + + leading string - block type ( + ir = InvertedResidual, ds = DepthwiseSep, dsa = DeptwhiseSep with pw act, cn = ConvBnAct) + r - number of repeat blocks, + k - kernel size, + s - strides (1-9), + e - expansion ratio, + c - output channels, + se - squeeze/excitation ratio + n - activation fn ('re', 'r6', 'hs', or 'sw') + Args: + block_str: a string representation of block arguments. + Returns: + A list of block args (dicts) + Raises: + ValueError: if the string def not properly specified (TODO) + """ + assert isinstance(block_str, str) + ops = block_str.split('_') + block_type = ops[0] # take the block type off the front + ops = ops[1:] + options = {} + noskip = False + for op in ops: + # string options being checked on individual basis, combine if they grow + if op == 'noskip': + noskip = True + elif op.startswith('n'): + # activation fn + key = op[0] + v = op[1:] + if v == 're': + value = get_act_layer('relu') + elif v == 'r6': + value = get_act_layer('relu6') + elif v == 'hs': + value = get_act_layer('hard_swish') + elif v == 'sw': + value = get_act_layer('swish') + else: + continue + options[key] = value + else: + # all numeric options + splits = re.split(r'(\d.*)', op) + if len(splits) >= 2: + key, value = splits[:2] + options[key] = value + + # if act_layer is None, the model default (passed to model init) will be used + act_layer = options['n'] if 'n' in options else None + exp_kernel_size = _parse_ksize(options['a']) if 'a' in options else 1 + pw_kernel_size = _parse_ksize(options['p']) if 'p' in options else 1 + fake_in_chs = int(options['fc']) if 'fc' in options else 0 # FIXME hack to deal with in_chs issue in TPU def + + num_repeat = int(options['r']) + # each type of block has different valid arguments, fill accordingly + if block_type == 'ir': + block_args = dict( + block_type=block_type, + dw_kernel_size=_parse_ksize(options['k']), + exp_kernel_size=exp_kernel_size, + pw_kernel_size=pw_kernel_size, + out_chs=int(options['c']), + exp_ratio=float(options['e']), + se_ratio=float(options['se']) if 'se' in options else None, + stride=int(options['s']), + act_layer=act_layer, + noskip=noskip, + ) + if 'cc' in options: + block_args['num_experts'] = int(options['cc']) + elif block_type == 'ds' or block_type == 'dsa': + block_args = dict( + block_type=block_type, + dw_kernel_size=_parse_ksize(options['k']), + pw_kernel_size=pw_kernel_size, + out_chs=int(options['c']), + se_ratio=float(options['se']) if 'se' in options else None, + stride=int(options['s']), + act_layer=act_layer, + pw_act=block_type == 'dsa', + noskip=block_type == 'dsa' or noskip, + ) + elif block_type == 'er': + block_args = dict( + 
block_type=block_type, + exp_kernel_size=_parse_ksize(options['k']), + pw_kernel_size=pw_kernel_size, + out_chs=int(options['c']), + exp_ratio=float(options['e']), + fake_in_chs=fake_in_chs, + se_ratio=float(options['se']) if 'se' in options else None, + stride=int(options['s']), + act_layer=act_layer, + noskip=noskip, + ) + elif block_type == 'cn': + block_args = dict( + block_type=block_type, + kernel_size=int(options['k']), + out_chs=int(options['c']), + stride=int(options['s']), + act_layer=act_layer, + ) + else: + assert False, 'Unknown block type (%s)' % block_type + + return block_args, num_repeat + + +def _scale_stage_depth(stack_args, repeats, depth_multiplier=1.0, depth_trunc='ceil'): + """ Per-stage depth scaling + Scales the block repeats in each stage. This depth scaling impl maintains + compatibility with the EfficientNet scaling method, while allowing sensible + scaling for other models that may have multiple block arg definitions in each stage. + """ + + # We scale the total repeat count for each stage, there may be multiple + # block arg defs per stage so we need to sum. + num_repeat = sum(repeats) + if depth_trunc == 'round': + # Truncating to int by rounding allows stages with few repeats to remain + # proportionally smaller for longer. This is a good choice when stage definitions + # include single repeat stages that we'd prefer to keep that way as long as possible + num_repeat_scaled = max(1, round(num_repeat * depth_multiplier)) + else: + # The default for EfficientNet truncates repeats to int via 'ceil'. + # Any multiplier > 1.0 will result in an increased depth for every stage. + num_repeat_scaled = int(math.ceil(num_repeat * depth_multiplier)) + + # Proportionally distribute repeat count scaling to each block definition in the stage. + # Allocation is done in reverse as it results in the first block being less likely to be scaled. + # The first block makes less sense to repeat in most of the arch definitions. 
+ repeats_scaled = [] + for r in repeats[::-1]: + rs = max(1, round((r / num_repeat * num_repeat_scaled))) + repeats_scaled.append(rs) + num_repeat -= r + num_repeat_scaled -= rs + repeats_scaled = repeats_scaled[::-1] + + # Apply the calculated scaling to each block arg in the stage + sa_scaled = [] + for ba, rep in zip(stack_args, repeats_scaled): + sa_scaled.extend([deepcopy(ba) for _ in range(rep)]) + return sa_scaled + + +def decode_arch_def(arch_def, depth_multiplier=1.0, depth_trunc='ceil', experts_multiplier=1, fix_first_last=False): + arch_args = [] + for stack_idx, block_strings in enumerate(arch_def): + assert isinstance(block_strings, list) + stack_args = [] + repeats = [] + for block_str in block_strings: + assert isinstance(block_str, str) + ba, rep = _decode_block_str(block_str) + if ba.get('num_experts', 0) > 0 and experts_multiplier > 1: + ba['num_experts'] *= experts_multiplier + stack_args.append(ba) + repeats.append(rep) + if fix_first_last and (stack_idx == 0 or stack_idx == len(arch_def) - 1): + arch_args.append(_scale_stage_depth(stack_args, repeats, 1.0, depth_trunc)) + else: + arch_args.append(_scale_stage_depth(stack_args, repeats, depth_multiplier, depth_trunc)) + return arch_args + + +class EfficientNetBuilder: + """ Build Trunk Blocks + + This ended up being somewhat of a cross between + https://github.com/tensorflow/tpu/blob/master/models/official/mnasnet/mnasnet_models.py + and + https://github.com/facebookresearch/maskrcnn-benchmark/blob/master/maskrcnn_benchmark/modeling/backbone/fbnet_builder.py + + """ + def __init__(self, channel_multiplier=1.0, channel_divisor=8, channel_min=None, + output_stride=32, pad_type='', act_layer=None, se_kwargs=None, + norm_layer=nn.BatchNorm2d, norm_kwargs=None, drop_path_rate=0., feature_location='', + verbose=False): + self.channel_multiplier = channel_multiplier + self.channel_divisor = channel_divisor + self.channel_min = channel_min + self.output_stride = output_stride + self.pad_type = pad_type + self.act_layer = act_layer + self.se_kwargs = se_kwargs + self.norm_layer = norm_layer + self.norm_kwargs = norm_kwargs + self.drop_path_rate = drop_path_rate + if feature_location == 'depthwise': + # old 'depthwise' mode renamed 'expansion' to match TF impl, old expansion mode didn't make sense + _logger.warning("feature_location=='depthwise' is deprecated, using 'expansion'") + feature_location = 'expansion' + self.feature_location = feature_location + assert feature_location in ('bottleneck', 'expansion', '') + self.verbose = verbose + + # state updated during build, consumed by model + self.in_chs = None + self.features = [] + + def _round_channels(self, chs): + return round_channels(chs, self.channel_multiplier, self.channel_divisor, self.channel_min) + + def _make_block(self, ba, block_idx, block_count): + drop_path_rate = self.drop_path_rate * block_idx / block_count + bt = ba.pop('block_type') + ba['in_chs'] = self.in_chs + ba['out_chs'] = self._round_channels(ba['out_chs']) + if 'fake_in_chs' in ba and ba['fake_in_chs']: + # FIXME this is a hack to work around mismatch in origin impl input filters + ba['fake_in_chs'] = self._round_channels(ba['fake_in_chs']) + ba['norm_layer'] = self.norm_layer + ba['norm_kwargs'] = self.norm_kwargs + ba['pad_type'] = self.pad_type + # block act fn overrides the model default + ba['act_layer'] = ba['act_layer'] if ba['act_layer'] is not None else self.act_layer + assert ba['act_layer'] is not None + if bt == 'ir': + ba['drop_path_rate'] = drop_path_rate + ba['se_kwargs'] = 
self.se_kwargs + _log_info_if(' InvertedResidual {}, Args: {}'.format(block_idx, str(ba)), self.verbose) + if ba.get('num_experts', 0) > 0: + block = CondConvResidual(**ba) + else: + block = InvertedResidual(**ba) + elif bt == 'ds' or bt == 'dsa': + ba['drop_path_rate'] = drop_path_rate + ba['se_kwargs'] = self.se_kwargs + _log_info_if(' DepthwiseSeparable {}, Args: {}'.format(block_idx, str(ba)), self.verbose) + block = DepthwiseSeparableConv(**ba) + elif bt == 'er': + ba['drop_path_rate'] = drop_path_rate + ba['se_kwargs'] = self.se_kwargs + _log_info_if(' EdgeResidual {}, Args: {}'.format(block_idx, str(ba)), self.verbose) + block = EdgeResidual(**ba) + elif bt == 'cn': + _log_info_if(' ConvBnAct {}, Args: {}'.format(block_idx, str(ba)), self.verbose) + block = ConvBnAct(**ba) + else: + assert False, 'Uknkown block type (%s) while building model.' % bt + self.in_chs = ba['out_chs'] # update in_chs for arg of next block + + return block + + def __call__(self, in_chs, model_block_args): + """ Build the blocks + Args: + in_chs: Number of input-channels passed to first block + model_block_args: A list of lists, outer list defines stages, inner + list contains strings defining block configuration(s) + Return: + List of block stacks (each stack wrapped in nn.Sequential) + """ + _log_info_if('Building model trunk with %d stages...' % len(model_block_args), self.verbose) + self.in_chs = in_chs + total_block_count = sum([len(x) for x in model_block_args]) + total_block_idx = 0 + current_stride = 2 + current_dilation = 1 + stages = [] + if model_block_args[0][0]['stride'] > 1: + # if the first block starts with a stride, we need to extract first level feat from stem + feature_info = dict( + module='act1', num_chs=in_chs, stage=0, reduction=current_stride, + hook_type='forward' if self.feature_location != 'bottleneck' else '') + self.features.append(feature_info) + + # outer list of block_args defines the stacks + for stack_idx, stack_args in enumerate(model_block_args): + last_stack = stack_idx + 1 == len(model_block_args) + _log_info_if('Stack: {}'.format(stack_idx), self.verbose) + assert isinstance(stack_args, list) + + blocks = [] + # each stack (stage of blocks) contains a list of block arguments + for block_idx, block_args in enumerate(stack_args): + last_block = block_idx + 1 == len(stack_args) + _log_info_if(' Block: {}'.format(block_idx), self.verbose) + + assert block_args['stride'] in (1, 2) + if block_idx >= 1: # only the first block in any stack can have a stride > 1 + block_args['stride'] = 1 + + extract_features = False + if last_block: + next_stack_idx = stack_idx + 1 + extract_features = next_stack_idx >= len(model_block_args) or \ + model_block_args[next_stack_idx][0]['stride'] > 1 + + next_dilation = current_dilation + if block_args['stride'] > 1: + next_output_stride = current_stride * block_args['stride'] + if next_output_stride > self.output_stride: + next_dilation = current_dilation * block_args['stride'] + block_args['stride'] = 1 + _log_info_if(' Converting stride to dilation to maintain output_stride=={}'.format( + self.output_stride), self.verbose) + else: + current_stride = next_output_stride + block_args['dilation'] = current_dilation + if next_dilation != current_dilation: + current_dilation = next_dilation + + # create the block + block = self._make_block(block_args, total_block_idx, total_block_count) + blocks.append(block) + + # stash feature module name and channel info for model feature extraction + if extract_features: + feature_info = dict( + stage=stack_idx + 
1, reduction=current_stride, **block.feature_info(self.feature_location)) + module_name = f'blocks.{stack_idx}.{block_idx}' + leaf_name = feature_info.get('module', '') + feature_info['module'] = '.'.join([module_name, leaf_name]) if leaf_name else module_name + self.features.append(feature_info) + + total_block_idx += 1 # incr global block idx (across all stacks) + stages.append(nn.Sequential(*blocks)) + return stages + + +def _init_weight_goog(m, n='', fix_group_fanout=True): + """ Weight initialization as per Tensorflow official implementations. + + Args: + m (nn.Module): module to init + n (str): module name + fix_group_fanout (bool): enable correct (matching Tensorflow TPU impl) fanout calculation w/ group convs + + Handles layers in EfficientNet, EfficientNet-CondConv, MixNet, MnasNet, MobileNetV3, etc: + * https://github.com/tensorflow/tpu/blob/master/models/official/mnasnet/mnasnet_model.py + * https://github.com/tensorflow/tpu/blob/master/models/official/efficientnet/efficientnet_model.py + """ + if isinstance(m, CondConv2d): + fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + if fix_group_fanout: + fan_out //= m.groups + init_weight_fn = get_condconv_initializer( + lambda w: w.data.normal_(0, math.sqrt(2.0 / fan_out)), m.num_experts, m.weight_shape) + init_weight_fn(m.weight) + if m.bias is not None: + m.bias.data.zero_() + elif isinstance(m, nn.Conv2d): + fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + if fix_group_fanout: + fan_out //= m.groups + m.weight.data.normal_(0, math.sqrt(2.0 / fan_out)) + if m.bias is not None: + m.bias.data.zero_() + elif isinstance(m, nn.BatchNorm2d): + m.weight.data.fill_(1.0) + m.bias.data.zero_() + elif isinstance(m, nn.Linear): + fan_out = m.weight.size(0) # fan-out + fan_in = 0 + if 'routing_fn' in n: + fan_in = m.weight.size(1) + init_range = 1.0 / math.sqrt(fan_in + fan_out) + m.weight.data.uniform_(-init_range, init_range) + m.bias.data.zero_() + + +def efficientnet_init_weights(model: nn.Module, init_fn=None): + init_fn = init_fn or _init_weight_goog + for n, m in model.named_modules(): + init_fn(m, n) + diff --git a/timm/models/quantization/layers/__init__.py b/timm/models/quantization/layers/__init__.py new file mode 100644 index 00000000..2f7326ca --- /dev/null +++ b/timm/models/quantization/layers/__init__.py @@ -0,0 +1,31 @@ +from .activations import * +from .adaptive_avgmax_pool import \ + adaptive_avgmax_pool2d, select_adaptive_pool2d, AdaptiveAvgMaxPool2d, SelectAdaptivePool2d +from .anti_aliasing import AntiAliasDownsampleLayer +from .blur_pool import BlurPool2d +from .classifier import ClassifierHead, create_classifier +from .cond_conv2d import CondConv2d, get_condconv_initializer +from .config import is_exportable, is_scriptable, is_no_jit, set_exportable, set_scriptable, set_no_jit,\ + set_layer_config +from .conv2d_same import Conv2dSame +from .conv_bn_act import ConvBnAct +from .create_act import create_act_layer, get_act_layer + +from .create_conv2d import create_conv2d +from .create_norm_act import create_norm_act, get_norm_act_layer +from .drop import DropBlock2d, DropPath, drop_block_2d, drop_path + +from .evo_norm import EvoNormBatch2d, EvoNormSample2d +from .inplace_abn import InplaceAbn +from .mixed_conv2d import MixedConv2d +from .norm_act import BatchNormAct2d +from .padding import get_padding +from .pool2d_same import AvgPool2dSame, create_pool2d + +from .selective_kernel import SelectiveKernelConv +from .separable_conv import SeparableConv2d, SeparableConvBnAct +from .space_to_depth 
import SpaceToDepthModule +from .split_attn import SplitAttnConv2d +from .split_batchnorm import SplitBatchNorm2d, convert_splitbn_model +from .test_time_pool import TestTimePoolHead, apply_test_time_pool +from .weight_init import trunc_normal_ diff --git a/timm/models/quantization/layers/activations.py b/timm/models/quantization/layers/activations.py new file mode 100644 index 00000000..76c71b03 --- /dev/null +++ b/timm/models/quantization/layers/activations.py @@ -0,0 +1,66 @@ +""" Activations + +A collection of activations fn and modules with a common interface so that they can +easily be swapped. All have an `inplace` arg even if not used. + +Hacked together by / Copyright 2020 Ross Wightman +""" + +import torch +from torch import nn as nn +from torch.nn import functional as F + + +class Swish(nn.Module): + def __init__(self, inplace: bool = False): + super(Swish, self).__init__() + nn.sig = nn.Sigmoid() + self.skip = nn.quantized.FloatFunctional() + def forward(self, x): + out = nn.sig(x) + + return self.skip.mul(x,out) + +def sigmoid(x, inplace: bool = False): + return x.sigmoid_() if inplace else x.sigmoid() + + +# PyTorch has this, but not with a consistent inplace argmument interface +class Sigmoid(nn.Module): + def __init__(self, inplace: bool = False): + super(Sigmoid, self).__init__() + nn.sig = nn.Sigmoid() + def forward(self, x): + out = nn.sig(x) + return out + + +class HardSwish(nn.Module): + def __init__(self, inplace: bool = False): + super(HardSwish, self).__init__() + self.relu6 = nn.ReLU6(inplace) + self.quant_mul1 = nn.quantized.FloatFunctional() + self.quant_mul2 = nn.quantized.FloatFunctional() + self.quant_add = nn.quantized.FloatFunctional() + + def forward(self, x): + out = self.quant_add.add_scalar(x, 3.0) + out = self.relu6(out) + out = self.quant_mul1.mul(x,out) + out = self.quant_mul2.mul_scalar(out, 1/6) + return out + + + +class HardSigmoid(nn.Module): + def __init__(self, inplace: bool = False): + super(HardSigmoid, self).__init__() + self.relu6 = nn.ReLU6(inplace) + self.quant_add = nn.quantized.FloatFunctional() + self.quant_mul = nn.quantized.FloatFunctional() + def forward(self, x): + out = self.quant_add.add_scalar(x, 3.0) + out = self.relu6(out) + out = self.quant_mul.mul_scalar(out,1/6) + return out + diff --git a/timm/models/quantization/layers/activations_jit.py b/timm/models/quantization/layers/activations_jit.py new file mode 100644 index 00000000..b4a51653 --- /dev/null +++ b/timm/models/quantization/layers/activations_jit.py @@ -0,0 +1,90 @@ +""" Activations + +A collection of jit-scripted activations fn and modules with a common interface so that they can +easily be swapped. All have an `inplace` arg even if not used. + +All jit scripted activations are lacking in-place variations on purpose, scripted kernel fusion does not +currently work across in-place op boundaries, thus performance is equal to or less than the non-scripted +versions if they contain in-place ops. 
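+
+A minimal usage sketch (illustrative only, using the classes defined below):
+
+    act = HardSwishJit()
+    y = act(torch.randn(2, 8, 4, 4))  # elementwise scripted hard_swish_jit, shape preserved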
+ +Hacked together by / Copyright 2020 Ross Wightman +""" + +import torch +from torch import nn as nn +from torch.nn import functional as F + + +@torch.jit.script +def swish_jit(x, inplace: bool = False): + """Swish - Described in: https://arxiv.org/abs/1710.05941 + """ + return x.mul(x.sigmoid()) + + +@torch.jit.script +def mish_jit(x, _inplace: bool = False): + """Mish: A Self Regularized Non-Monotonic Neural Activation Function - https://arxiv.org/abs/1908.08681 + """ + return x.mul(F.softplus(x).tanh()) + + +class SwishJit(nn.Module): + def __init__(self, inplace: bool = False): + super(SwishJit, self).__init__() + + def forward(self, x): + return swish_jit(x) + + +class MishJit(nn.Module): + def __init__(self, inplace: bool = False): + super(MishJit, self).__init__() + + def forward(self, x): + return mish_jit(x) + + +@torch.jit.script +def hard_sigmoid_jit(x, inplace: bool = False): + # return F.relu6(x + 3.) / 6. + return (x + 3).clamp(min=0, max=6).div(6.) # clamp seems ever so slightly faster? + + +class HardSigmoidJit(nn.Module): + def __init__(self, inplace: bool = False): + super(HardSigmoidJit, self).__init__() + + def forward(self, x): + return hard_sigmoid_jit(x) + + +@torch.jit.script +def hard_swish_jit(x, inplace: bool = False): + # return x * (F.relu6(x + 3.) / 6) + return x * (x + 3).clamp(min=0, max=6).div(6.) # clamp seems ever so slightly faster? + + +class HardSwishJit(nn.Module): + def __init__(self, inplace: bool = False): + super(HardSwishJit, self).__init__() + + def forward(self, x): + return hard_swish_jit(x) + + +@torch.jit.script +def hard_mish_jit(x, inplace: bool = False): + """ Hard Mish + Experimental, based on notes by Mish author Diganta Misra at + https://github.com/digantamisra98/H-Mish/blob/0da20d4bc58e696b6803f2523c58d3c8a82782d0/README.md + """ + return 0.5 * x * (x + 2).clamp(min=0, max=2) + + +class HardMishJit(nn.Module): + def __init__(self, inplace: bool = False): + super(HardMishJit, self).__init__() + + def forward(self, x): + return hard_mish_jit(x) diff --git a/timm/models/quantization/layers/activations_me.py b/timm/models/quantization/layers/activations_me.py new file mode 100644 index 00000000..0441f7c4 --- /dev/null +++ b/timm/models/quantization/layers/activations_me.py @@ -0,0 +1,208 @@ +""" Activations (memory-efficient w/ custom autograd) + +A collection of activations fn and modules with a common interface so that they can +easily be swapped. All have an `inplace` arg even if not used. + +These activations are not compatible with jit scripting or ONNX export of the model, please use either +the JIT or basic versions of the activations. 
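+
+Illustrative sketch (assumed shapes, not part of the original docstring): the custom autograd
+functions save only the raw input for backward and recompute the cheap terms there, e.g.
+
+    x = torch.randn(4, 16, requires_grad=True)
+    y = swish_me(x)      # forward via SwishJitAutoFn
+    y.sum().backward()   # grad is x.sigmoid() * (1 + x * (1 - x.sigmoid()))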
+ +Hacked together by / Copyright 2020 Ross Wightman +""" + +import torch +from torch import nn as nn +from torch.nn import functional as F + + +@torch.jit.script +def swish_jit_fwd(x): + return x.mul(torch.sigmoid(x)) + + +@torch.jit.script +def swish_jit_bwd(x, grad_output): + x_sigmoid = torch.sigmoid(x) + return grad_output * (x_sigmoid * (1 + x * (1 - x_sigmoid))) + + +class SwishJitAutoFn(torch.autograd.Function): + """ torch.jit.script optimised Swish w/ memory-efficient checkpoint + Inspired by conversation btw Jeremy Howard & Adam Pazske + https://twitter.com/jeremyphoward/status/1188251041835315200 + """ + + @staticmethod + def forward(ctx, x): + ctx.save_for_backward(x) + return swish_jit_fwd(x) + + @staticmethod + def backward(ctx, grad_output): + x = ctx.saved_tensors[0] + return swish_jit_bwd(x, grad_output) + + +def swish_me(x, inplace=False): + return SwishJitAutoFn.apply(x) + + +class SwishMe(nn.Module): + def __init__(self, inplace: bool = False): + super(SwishMe, self).__init__() + + def forward(self, x): + return SwishJitAutoFn.apply(x) + + +@torch.jit.script +def mish_jit_fwd(x): + return x.mul(torch.tanh(F.softplus(x))) + + +@torch.jit.script +def mish_jit_bwd(x, grad_output): + x_sigmoid = torch.sigmoid(x) + x_tanh_sp = F.softplus(x).tanh() + return grad_output.mul(x_tanh_sp + x * x_sigmoid * (1 - x_tanh_sp * x_tanh_sp)) + + +class MishJitAutoFn(torch.autograd.Function): + """ Mish: A Self Regularized Non-Monotonic Neural Activation Function - https://arxiv.org/abs/1908.08681 + A memory efficient, jit scripted variant of Mish + """ + @staticmethod + def forward(ctx, x): + ctx.save_for_backward(x) + return mish_jit_fwd(x) + + @staticmethod + def backward(ctx, grad_output): + x = ctx.saved_tensors[0] + return mish_jit_bwd(x, grad_output) + + +def mish_me(x, inplace=False): + return MishJitAutoFn.apply(x) + + +class MishMe(nn.Module): + def __init__(self, inplace: bool = False): + super(MishMe, self).__init__() + + def forward(self, x): + return MishJitAutoFn.apply(x) + + +@torch.jit.script +def hard_sigmoid_jit_fwd(x, inplace: bool = False): + return (x + 3).clamp(min=0, max=6).div(6.) + + +@torch.jit.script +def hard_sigmoid_jit_bwd(x, grad_output): + m = torch.ones_like(x) * ((x >= -3.) & (x <= 3.)) / 6. + return grad_output * m + + +class HardSigmoidJitAutoFn(torch.autograd.Function): + @staticmethod + def forward(ctx, x): + ctx.save_for_backward(x) + return hard_sigmoid_jit_fwd(x) + + @staticmethod + def backward(ctx, grad_output): + x = ctx.saved_tensors[0] + return hard_sigmoid_jit_bwd(x, grad_output) + + +def hard_sigmoid_me(x, inplace: bool = False): + return HardSigmoidJitAutoFn.apply(x) + + +class HardSigmoidMe(nn.Module): + def __init__(self, inplace: bool = False): + super(HardSigmoidMe, self).__init__() + + def forward(self, x): + return HardSigmoidJitAutoFn.apply(x) + + +@torch.jit.script +def hard_swish_jit_fwd(x): + return x * (x + 3).clamp(min=0, max=6).div(6.) + + +@torch.jit.script +def hard_swish_jit_bwd(x, grad_output): + m = torch.ones_like(x) * (x >= 3.) + m = torch.where((x >= -3.) & (x <= 3.), x / 3. 
+ .5, m) + return grad_output * m + + +class HardSwishJitAutoFn(torch.autograd.Function): + """A memory efficient, jit-scripted HardSwish activation""" + @staticmethod + def forward(ctx, x): + ctx.save_for_backward(x) + return hard_swish_jit_fwd(x) + + @staticmethod + def backward(ctx, grad_output): + x = ctx.saved_tensors[0] + return hard_swish_jit_bwd(x, grad_output) + + +def hard_swish_me(x, inplace=False): + return HardSwishJitAutoFn.apply(x) + + +class HardSwishMe(nn.Module): + def __init__(self, inplace: bool = False): + super(HardSwishMe, self).__init__() + + def forward(self, x): + return HardSwishJitAutoFn.apply(x) + + +@torch.jit.script +def hard_mish_jit_fwd(x): + return 0.5 * x * (x + 2).clamp(min=0, max=2) + + +@torch.jit.script +def hard_mish_jit_bwd(x, grad_output): + m = torch.ones_like(x) * (x >= -2.) + m = torch.where((x >= -2.) & (x <= 0.), x + 1., m) + return grad_output * m + + +class HardMishJitAutoFn(torch.autograd.Function): + """ A memory efficient, jit scripted variant of Hard Mish + Experimental, based on notes by Mish author Diganta Misra at + https://github.com/digantamisra98/H-Mish/blob/0da20d4bc58e696b6803f2523c58d3c8a82782d0/README.md + """ + @staticmethod + def forward(ctx, x): + ctx.save_for_backward(x) + return hard_mish_jit_fwd(x) + + @staticmethod + def backward(ctx, grad_output): + x = ctx.saved_tensors[0] + return hard_mish_jit_bwd(x, grad_output) + + +def hard_mish_me(x, inplace: bool = False): + return HardMishJitAutoFn.apply(x) + + +class HardMishMe(nn.Module): + def __init__(self, inplace: bool = False): + super(HardMishMe, self).__init__() + + def forward(self, x): + return HardMishJitAutoFn.apply(x) + + + diff --git a/timm/models/quantization/layers/adaptive_avgmax_pool.py b/timm/models/quantization/layers/adaptive_avgmax_pool.py new file mode 100644 index 00000000..482c0c01 --- /dev/null +++ b/timm/models/quantization/layers/adaptive_avgmax_pool.py @@ -0,0 +1,106 @@ +""" PyTorch selectable adaptive pooling +Adaptive pooling with the ability to select the type of pooling from: + * 'avg' - Average pooling + * 'max' - Max pooling + * 'avgmax' - Sum of average and max pooling re-scaled by 0.5 + * 'avgmaxc' - Concatenation of average and max pooling along feature dim, doubles feature dim + +Both a functional and a nn.Module version of the pooling is provided. 
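+
+Illustrative usage (assumed shapes):
+
+    pool = SelectAdaptivePool2d(pool_type='catavgmax', flatten=True)
+    out = pool(torch.randn(2, 64, 7, 7))  # (2, 128): avg- and max-pooled features concatenated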
+ +Hacked together by / Copyright 2020 Ross Wightman +""" +import torch +import torch.nn as nn +import torch.nn.functional as F + + +def adaptive_pool_feat_mult(pool_type='avg'): + if pool_type == 'catavgmax': + return 2 + else: + return 1 + + +def adaptive_avgmax_pool2d(x, output_size=1): + x_avg = F.adaptive_avg_pool2d(x, output_size) + x_max = F.adaptive_max_pool2d(x, output_size) + return 0.5 * (x_avg + x_max) + + +def adaptive_catavgmax_pool2d(x, output_size=1): + x_avg = F.adaptive_avg_pool2d(x, output_size) + x_max = F.adaptive_max_pool2d(x, output_size) + return torch.cat((x_avg, x_max), 1) + + +def select_adaptive_pool2d(x, pool_type='avg', output_size=1): + """Selectable global pooling function with dynamic input kernel size + """ + if pool_type == 'avg': + x = F.adaptive_avg_pool2d(x, output_size) + elif pool_type == 'avgmax': + x = adaptive_avgmax_pool2d(x, output_size) + elif pool_type == 'catavgmax': + x = adaptive_catavgmax_pool2d(x, output_size) + elif pool_type == 'max': + x = F.adaptive_max_pool2d(x, output_size) + else: + assert False, 'Invalid pool type: %s' % pool_type + return x + + +class AdaptiveAvgMaxPool2d(nn.Module): + def __init__(self, output_size=1): + super(AdaptiveAvgMaxPool2d, self).__init__() + self.output_size = output_size + + def forward(self, x): + return adaptive_avgmax_pool2d(x, self.output_size) + + +class AdaptiveCatAvgMaxPool2d(nn.Module): + def __init__(self, output_size=1): + super(AdaptiveCatAvgMaxPool2d, self).__init__() + self.output_size = output_size + + def forward(self, x): + return adaptive_catavgmax_pool2d(x, self.output_size) + + +class SelectAdaptivePool2d(nn.Module): + """Selectable global pooling layer with dynamic input kernel size + """ + def __init__(self, output_size=1, pool_type='avg', flatten=False): + super(SelectAdaptivePool2d, self).__init__() + self.pool_type = pool_type or '' # convert other falsy values to empty string for consistent TS typing + self.flatten = flatten + if pool_type == '': + self.pool = nn.Identity() # pass through + elif pool_type == 'avg': + self.pool = nn.AdaptiveAvgPool2d(output_size) + elif pool_type == 'avgmax': + self.pool = AdaptiveAvgMaxPool2d(output_size) + elif pool_type == 'catavgmax': + self.pool = AdaptiveCatAvgMaxPool2d(output_size) + elif pool_type == 'max': + self.pool = nn.AdaptiveMaxPool2d(output_size) + else: + assert False, 'Invalid pool type: %s' % pool_type + + def is_identity(self): + return self.pool_type == '' + + def forward(self, x): + x = self.pool(x) + if self.flatten: + x = x.flatten(1) + return x + + def feat_mult(self): + return adaptive_pool_feat_mult(self.pool_type) + + def __repr__(self): + return self.__class__.__name__ + ' (' \ + + 'pool_type=' + self.pool_type \ + + ', flatten=' + str(self.flatten) + ')' + diff --git a/timm/models/quantization/layers/anti_aliasing.py b/timm/models/quantization/layers/anti_aliasing.py new file mode 100644 index 00000000..9d3837e8 --- /dev/null +++ b/timm/models/quantization/layers/anti_aliasing.py @@ -0,0 +1,60 @@ +import torch +import torch.nn.parallel +import torch.nn as nn +import torch.nn.functional as F + + +class AntiAliasDownsampleLayer(nn.Module): + def __init__(self, channels: int = 0, filt_size: int = 3, stride: int = 2, no_jit: bool = False): + super(AntiAliasDownsampleLayer, self).__init__() + if no_jit: + self.op = Downsample(channels, filt_size, stride) + else: + self.op = DownsampleJIT(channels, filt_size, stride) + + # FIXME I should probably override _apply and clear DownsampleJIT filter cache for .cuda(), .half(), 
etc calls + + def forward(self, x): + return self.op(x) + + +@torch.jit.script +class DownsampleJIT(object): + def __init__(self, channels: int = 0, filt_size: int = 3, stride: int = 2): + self.channels = channels + self.stride = stride + self.filt_size = filt_size + assert self.filt_size == 3 + assert stride == 2 + self.filt = {} # lazy init by device for DataParallel compat + + def _create_filter(self, like: torch.Tensor): + filt = torch.tensor([1., 2., 1.], dtype=like.dtype, device=like.device) + filt = filt[:, None] * filt[None, :] + filt = filt / torch.sum(filt) + return filt[None, None, :, :].repeat((self.channels, 1, 1, 1)) + + def __call__(self, input: torch.Tensor): + input_pad = F.pad(input, (1, 1, 1, 1), 'reflect') + filt = self.filt.get(str(input.device), self._create_filter(input)) + return F.conv2d(input_pad, filt, stride=2, padding=0, groups=input.shape[1]) + + +class Downsample(nn.Module): + def __init__(self, channels=None, filt_size=3, stride=2): + super(Downsample, self).__init__() + self.channels = channels + self.filt_size = filt_size + self.stride = stride + + assert self.filt_size == 3 + filt = torch.tensor([1., 2., 1.]) + filt = filt[:, None] * filt[None, :] + filt = filt / torch.sum(filt) + + # self.filt = filt[None, None, :, :].repeat((self.channels, 1, 1, 1)) + self.register_buffer('filt', filt[None, None, :, :].repeat((self.channels, 1, 1, 1))) + + def forward(self, input): + input_pad = F.pad(input, (1, 1, 1, 1), 'reflect') + return F.conv2d(input_pad, self.filt, stride=self.stride, padding=0, groups=input.shape[1]) diff --git a/timm/models/quantization/layers/blur_pool.py b/timm/models/quantization/layers/blur_pool.py new file mode 100644 index 00000000..399cbe35 --- /dev/null +++ b/timm/models/quantization/layers/blur_pool.py @@ -0,0 +1,58 @@ +""" +BlurPool layer inspired by + - Kornia's Max_BlurPool2d + - Making Convolutional Networks Shift-Invariant Again :cite:`zhang2019shiftinvar` + +FIXME merge this impl with those in `anti_aliasing.py` + +Hacked together by Chris Ha and Ross Wightman +""" + +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np +from typing import Dict +from .padding import get_padding + + +class BlurPool2d(nn.Module): + r"""Creates a module that computes blurs and downsample a given feature map. + See :cite:`zhang2019shiftinvar` for more details. + Corresponds to the Downsample class, which does blurring and subsampling + + Args: + channels = Number of input channels + filt_size (int): binomial filter size for blurring. currently supports 3 (default) and 5. + stride (int): downsampling filter stride + + Returns: + torch.Tensor: the transformed tensor. 
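+
+ Example (illustrative, assumed shapes):
+ pool = BlurPool2d(channels=32, filt_size=3, stride=2)
+ out = pool(torch.randn(1, 32, 56, 56))  # blurred, then strided down to (1, 32, 28, 28)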
+ """ + filt: Dict[str, torch.Tensor] + + def __init__(self, channels, filt_size=3, stride=2) -> None: + super(BlurPool2d, self).__init__() + assert filt_size > 1 + self.channels = channels + self.filt_size = filt_size + self.stride = stride + pad_size = [get_padding(filt_size, stride, dilation=1)] * 4 + self.padding = nn.ReflectionPad2d(pad_size) + self._coeffs = torch.tensor((np.poly1d((0.5, 0.5)) ** (self.filt_size - 1)).coeffs) # for torchscript compat + self.filt = {} # lazy init by device for DataParallel compat + + def _create_filter(self, like: torch.Tensor): + blur_filter = (self._coeffs[:, None] * self._coeffs[None, :]).to(dtype=like.dtype, device=like.device) + return blur_filter[None, None, :, :].repeat(self.channels, 1, 1, 1) + + def _apply(self, fn): + # override nn.Module _apply, reset filter cache if used + self.filt = {} + super(BlurPool2d, self)._apply(fn) + + def forward(self, input_tensor: torch.Tensor) -> torch.Tensor: + C = input_tensor.shape[1] + blur_filt = self.filt.get(str(input_tensor.device), self._create_filter(input_tensor)) + return F.conv2d( + self.padding(input_tensor), blur_filt, stride=self.stride, groups=C) diff --git a/timm/models/quantization/layers/classifier.py b/timm/models/quantization/layers/classifier.py new file mode 100644 index 00000000..e9194f05 --- /dev/null +++ b/timm/models/quantization/layers/classifier.py @@ -0,0 +1,41 @@ +""" Classifier head and layer factory + +Hacked together by / Copyright 2020 Ross Wightman +""" +from torch import nn as nn +from torch.nn import functional as F + +from .adaptive_avgmax_pool import SelectAdaptivePool2d + + +def create_classifier(num_features, num_classes, pool_type='avg', use_conv=False): + flatten = not use_conv # flatten when we use a Linear layer after pooling + if not pool_type: + assert num_classes == 0 or use_conv,\ + 'Pooling can only be disabled if classifier is also removed or conv classifier is used' + flatten = False # disable flattening if pooling is pass-through (no pooling) + global_pool = SelectAdaptivePool2d(pool_type=pool_type, flatten=flatten) + num_pooled_features = num_features * global_pool.feat_mult() + if num_classes <= 0: + fc = nn.Identity() # pass-through (no classifier) + elif use_conv: + fc = nn.Conv2d(num_pooled_features, num_classes, 1, bias=True) + else: + fc = nn.Linear(num_pooled_features, num_classes, bias=True) + return global_pool, fc + + +class ClassifierHead(nn.Module): + """Classifier head w/ configurable global pooling and dropout.""" + + def __init__(self, in_chs, num_classes, pool_type='avg', drop_rate=0.): + super(ClassifierHead, self).__init__() + self.drop_rate = drop_rate + self.global_pool, self.fc = create_classifier(in_chs, num_classes, pool_type=pool_type) + + def forward(self, x): + x = self.global_pool(x) + if self.drop_rate: + x = F.dropout(x, p=float(self.drop_rate), training=self.training) + x = self.fc(x) + return x diff --git a/timm/models/quantization/layers/cond_conv2d.py b/timm/models/quantization/layers/cond_conv2d.py new file mode 100644 index 00000000..175292b7 --- /dev/null +++ b/timm/models/quantization/layers/cond_conv2d.py @@ -0,0 +1,122 @@ +""" PyTorch Conditionally Parameterized Convolution (CondConv) + +Paper: CondConv: Conditionally Parameterized Convolutions for Efficient Inference +(https://arxiv.org/abs/1904.04971) + +Hacked together by / Copyright 2020 Ross Wightman +""" + +import math +from functools import partial +import numpy as np +import torch +from torch import nn as nn +from torch.nn import functional as F + +from 
.helpers import tup_pair +from .conv2d_same import conv2d_same +from .padding import get_padding_value + + +def get_condconv_initializer(initializer, num_experts, expert_shape): + def condconv_initializer(weight): + """CondConv initializer function.""" + num_params = np.prod(expert_shape) + if (len(weight.shape) != 2 or weight.shape[0] != num_experts or + weight.shape[1] != num_params): + raise (ValueError( + 'CondConv variables must have shape [num_experts, num_params]')) + for i in range(num_experts): + initializer(weight[i].view(expert_shape)) + return condconv_initializer + + +class CondConv2d(nn.Module): + """ Conditionally Parameterized Convolution + Inspired by: https://github.com/tensorflow/tpu/blob/master/models/official/efficientnet/condconv/condconv_layers.py + + Grouped convolution hackery for parallel execution of the per-sample kernel filters inspired by this discussion: + https://github.com/pytorch/pytorch/issues/17983 + """ + __constants__ = ['in_channels', 'out_channels', 'dynamic_padding'] + + def __init__(self, in_channels, out_channels, kernel_size=3, + stride=1, padding='', dilation=1, groups=1, bias=False, num_experts=4): + super(CondConv2d, self).__init__() + + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = tup_pair(kernel_size) + self.stride = tup_pair(stride) + padding_val, is_padding_dynamic = get_padding_value( + padding, kernel_size, stride=stride, dilation=dilation) + self.dynamic_padding = is_padding_dynamic # if in forward to work with torchscript + self.padding = tup_pair(padding_val) + self.dilation = tup_pair(dilation) + self.groups = groups + self.num_experts = num_experts + + self.weight_shape = (self.out_channels, self.in_channels // self.groups) + self.kernel_size + weight_num_param = 1 + for wd in self.weight_shape: + weight_num_param *= wd + self.weight = torch.nn.Parameter(torch.Tensor(self.num_experts, weight_num_param)) + + if bias: + self.bias_shape = (self.out_channels,) + self.bias = torch.nn.Parameter(torch.Tensor(self.num_experts, self.out_channels)) + else: + self.register_parameter('bias', None) + + self.reset_parameters() + + def reset_parameters(self): + init_weight = get_condconv_initializer( + partial(nn.init.kaiming_uniform_, a=math.sqrt(5)), self.num_experts, self.weight_shape) + init_weight(self.weight) + if self.bias is not None: + fan_in = np.prod(self.weight_shape[1:]) + bound = 1 / math.sqrt(fan_in) + init_bias = get_condconv_initializer( + partial(nn.init.uniform_, a=-bound, b=bound), self.num_experts, self.bias_shape) + init_bias(self.bias) + + def forward(self, x, routing_weights): + B, C, H, W = x.shape + weight = torch.matmul(routing_weights, self.weight) + new_weight_shape = (B * self.out_channels, self.in_channels // self.groups) + self.kernel_size + weight = weight.view(new_weight_shape) + bias = None + if self.bias is not None: + bias = torch.matmul(routing_weights, self.bias) + bias = bias.view(B * self.out_channels) + # move batch elements with channels so each batch element can be efficiently convolved with separate kernel + x = x.view(1, B * C, H, W) + if self.dynamic_padding: + out = conv2d_same( + x, weight, bias, stride=self.stride, padding=self.padding, + dilation=self.dilation, groups=self.groups * B) + else: + out = F.conv2d( + x, weight, bias, stride=self.stride, padding=self.padding, + dilation=self.dilation, groups=self.groups * B) + out = out.permute([1, 0, 2, 3]).view(B, self.out_channels, out.shape[-2], out.shape[-1]) + + # Literal port (from TF definition) + # x = 
torch.split(x, 1, 0) + # weight = torch.split(weight, 1, 0) + # if self.bias is not None: + # bias = torch.matmul(routing_weights, self.bias) + # bias = torch.split(bias, 1, 0) + # else: + # bias = [None] * B + # out = [] + # for xi, wi, bi in zip(x, weight, bias): + # wi = wi.view(*self.weight_shape) + # if bi is not None: + # bi = bi.view(*self.bias_shape) + # out.append(self.conv_fn( + # xi, wi, bi, stride=self.stride, padding=self.padding, + # dilation=self.dilation, groups=self.groups)) + # out = torch.cat(out, 0) + return out diff --git a/timm/models/quantization/layers/config.py b/timm/models/quantization/layers/config.py new file mode 100644 index 00000000..f07b9d78 --- /dev/null +++ b/timm/models/quantization/layers/config.py @@ -0,0 +1,115 @@ +""" Model / Layer Config singleton state +""" +from typing import Any, Optional + +__all__ = [ + 'is_exportable', 'is_scriptable', 'is_no_jit', + 'set_exportable', 'set_scriptable', 'set_no_jit', 'set_layer_config' +] + +# Set to True if prefer to have layers with no jit optimization (includes activations) +_NO_JIT = False + +# Set to True if prefer to have activation layers with no jit optimization +# NOTE not currently used as no difference between no_jit and no_activation jit as only layers obeying +# the jit flags so far are activations. This will change as more layers are updated and/or added. +_NO_ACTIVATION_JIT = False + +# Set to True if exporting a model with Same padding via ONNX +_EXPORTABLE = False + +# Set to True if wanting to use torch.jit.script on a model +_SCRIPTABLE = False + + +def is_no_jit(): + return _NO_JIT + + +class set_no_jit: + def __init__(self, mode: bool) -> None: + global _NO_JIT + self.prev = _NO_JIT + _NO_JIT = mode + + def __enter__(self) -> None: + pass + + def __exit__(self, *args: Any) -> bool: + global _NO_JIT + _NO_JIT = self.prev + return False + + +def is_exportable(): + return _EXPORTABLE + + +class set_exportable: + def __init__(self, mode: bool) -> None: + global _EXPORTABLE + self.prev = _EXPORTABLE + _EXPORTABLE = mode + + def __enter__(self) -> None: + pass + + def __exit__(self, *args: Any) -> bool: + global _EXPORTABLE + _EXPORTABLE = self.prev + return False + + +def is_scriptable(): + return _SCRIPTABLE + + +class set_scriptable: + def __init__(self, mode: bool) -> None: + global _SCRIPTABLE + self.prev = _SCRIPTABLE + _SCRIPTABLE = mode + + def __enter__(self) -> None: + pass + + def __exit__(self, *args: Any) -> bool: + global _SCRIPTABLE + _SCRIPTABLE = self.prev + return False + + +class set_layer_config: + """ Layer config context manager that allows setting all layer config flags at once. + If a flag arg is None, it will not change the current value. 
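+
+ Example (illustrative; build_model() is a placeholder for whatever constructs the network):
+ with set_layer_config(scriptable=True, no_jit=True):
+ model = build_model()  # layers created inside see the temporary flag values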
+ """ + def __init__( + self, + scriptable: Optional[bool] = None, + exportable: Optional[bool] = None, + no_jit: Optional[bool] = None, + no_activation_jit: Optional[bool] = None): + global _SCRIPTABLE + global _EXPORTABLE + global _NO_JIT + global _NO_ACTIVATION_JIT + self.prev = _SCRIPTABLE, _EXPORTABLE, _NO_JIT, _NO_ACTIVATION_JIT + if scriptable is not None: + _SCRIPTABLE = scriptable + if exportable is not None: + _EXPORTABLE = exportable + if no_jit is not None: + _NO_JIT = no_jit + if no_activation_jit is not None: + _NO_ACTIVATION_JIT = no_activation_jit + + def __enter__(self) -> None: + pass + + def __exit__(self, *args: Any) -> bool: + global _SCRIPTABLE + global _EXPORTABLE + global _NO_JIT + global _NO_ACTIVATION_JIT + _SCRIPTABLE, _EXPORTABLE, _NO_JIT, _NO_ACTIVATION_JIT = self.prev + return False diff --git a/timm/models/quantization/layers/conv2d_same.py b/timm/models/quantization/layers/conv2d_same.py new file mode 100644 index 00000000..75f0f98d --- /dev/null +++ b/timm/models/quantization/layers/conv2d_same.py @@ -0,0 +1,42 @@ +""" Conv2d w/ Same Padding + +Hacked together by / Copyright 2020 Ross Wightman +""" +import torch +import torch.nn as nn +import torch.nn.functional as F +from typing import Tuple, Optional + +from .padding import pad_same, get_padding_value + + +def conv2d_same( + x, weight: torch.Tensor, bias: Optional[torch.Tensor] = None, stride: Tuple[int, int] = (1, 1), + padding: Tuple[int, int] = (0, 0), dilation: Tuple[int, int] = (1, 1), groups: int = 1): + x = pad_same(x, weight.shape[-2:], stride, dilation) + return F.conv2d(x, weight, bias, stride, (0, 0), dilation, groups) + + +class Conv2dSame(nn.Conv2d): + """ Tensorflow like 'SAME' convolution wrapper for 2D convolutions + """ + + def __init__(self, in_channels, out_channels, kernel_size, stride=1, + padding=0, dilation=1, groups=1, bias=True): + super(Conv2dSame, self).__init__( + in_channels, out_channels, kernel_size, stride, 0, dilation, groups, bias) + + def forward(self, x): + return conv2d_same(x, self.weight, self.bias, self.stride, self.padding, self.dilation, self.groups) + + +def create_conv2d_pad(in_chs, out_chs, kernel_size, **kwargs): + padding = kwargs.pop('padding', '') + kwargs.setdefault('bias', False) + padding, is_dynamic = get_padding_value(padding, kernel_size, **kwargs) + if is_dynamic: + return Conv2dSame(in_chs, out_chs, kernel_size, **kwargs) + else: + return nn.Conv2d(in_chs, out_chs, kernel_size, padding=padding, **kwargs) + + diff --git a/timm/models/quantization/layers/conv_bn_act.py b/timm/models/quantization/layers/conv_bn_act.py new file mode 100644 index 00000000..90735357 --- /dev/null +++ b/timm/models/quantization/layers/conv_bn_act.py @@ -0,0 +1,40 @@ +""" Conv2d + BN + Act + +Hacked together by / Copyright 2020 Ross Wightman +""" +from torch import nn as nn + +from .create_conv2d import create_conv2d +from .create_norm_act import convert_norm_act_type + + +class ConvBnAct(nn.Module): + def __init__(self, in_channels, out_channels, kernel_size=1, stride=1, padding='', dilation=1, groups=1, + norm_layer=nn.BatchNorm2d, norm_kwargs=None, act_layer=nn.ReLU, apply_act=True, + drop_block=None, aa_layer=None): + super(ConvBnAct, self).__init__() + use_aa = aa_layer is not None + + self.conv = create_conv2d( + in_channels, out_channels, kernel_size, stride=1 if use_aa else stride, + padding=padding, dilation=dilation, groups=groups, bias=False) + + # NOTE for backwards compatibility with models that use separate norm and act layer definitions + norm_act_layer, 
norm_act_args = convert_norm_act_type(norm_layer, act_layer, norm_kwargs) + self.bn = norm_act_layer(out_channels, apply_act=apply_act, drop_block=drop_block, **norm_act_args) + self.aa = aa_layer(channels=out_channels) if stride == 2 and use_aa else None + + @property + def in_channels(self): + return self.conv.in_channels + + @property + def out_channels(self): + return self.conv.out_channels + + def forward(self, x): + x = self.conv(x) + x = self.bn(x) + if self.aa is not None: + x = self.aa(x) + return x diff --git a/timm/models/quantization/layers/create_act.py b/timm/models/quantization/layers/create_act.py new file mode 100644 index 00000000..6787b081 --- /dev/null +++ b/timm/models/quantization/layers/create_act.py @@ -0,0 +1,36 @@ +""" Activation Factory +Hacked together by / Copyright 2020 Ross Wightman +""" +from .activations import * +from .activations_jit import * +from .activations_me import * +from .config import is_exportable, is_scriptable, is_no_jit + +_ACT_LAYER_DEFAULT = dict( + swish=Swish, + relu=nn.ReLU, + relu6=nn.ReLU6, + sigmoid=Sigmoid, + hard_sigmoid=HardSigmoid, + hard_swish=HardSwish, +) + + + +def get_act_layer(name='relu'): + """ Activation Layer Factory + Fetching activation layers by name with this function allows export or torch script friendly + functions to be returned dynamically based on current config. + """ + if not name: + return None + + return _ACT_LAYER_DEFAULT[name] + + +def create_act_layer(name, inplace=False, **kwargs): + act_layer = get_act_layer(name) + if act_layer is not None: + return act_layer(inplace=inplace, **kwargs) + else: + return None diff --git a/timm/models/quantization/layers/create_conv2d.py b/timm/models/quantization/layers/create_conv2d.py new file mode 100644 index 00000000..0134b05c --- /dev/null +++ b/timm/models/quantization/layers/create_conv2d.py @@ -0,0 +1,30 @@ +""" Create Conv2d Factory Method + +Hacked together by / Copyright 2020 Ross Wightman +""" + +from .mixed_conv2d import MixedConv2d +from .cond_conv2d import CondConv2d +from .conv2d_same import create_conv2d_pad + + +def create_conv2d(in_channels, out_channels, kernel_size, **kwargs): + """ Select a 2d convolution implementation based on arguments + Creates and returns one of torch.nn.Conv2d, Conv2dSame, MixedConv2d, or CondConv2d. + + Used extensively by EfficientNet, MobileNetv3 and related networks. + """ + if isinstance(kernel_size, list): + assert 'num_experts' not in kwargs # MixNet + CondConv combo not supported currently + assert 'groups' not in kwargs # MixedConv groups are defined by kernel list + # We're going to use only lists for defining the MixedConv2d kernel groups, + # ints, tuples, other iterables will continue to pass to normal conv and specify h, w. 
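+ # e.g. (illustrative, assumed call): create_conv2d(64, 64, [3, 5, 7], depthwise=True) returns a
+ # MixedConv2d that splits the 64 channels into three groups using 3x3, 5x5 and 7x7 depthwise kernels.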
+ m = MixedConv2d(in_channels, out_channels, kernel_size, **kwargs) + else: + depthwise = kwargs.pop('depthwise', False) + groups = out_channels if depthwise else kwargs.pop('groups', 1) + if 'num_experts' in kwargs and kwargs['num_experts'] > 0: + m = CondConv2d(in_channels, out_channels, kernel_size, groups=groups, **kwargs) + else: + m = create_conv2d_pad(in_channels, out_channels, kernel_size, groups=groups, **kwargs) + return m diff --git a/timm/models/quantization/layers/create_norm_act.py b/timm/models/quantization/layers/create_norm_act.py new file mode 100644 index 00000000..9e7e529e --- /dev/null +++ b/timm/models/quantization/layers/create_norm_act.py @@ -0,0 +1,74 @@ +""" NormAct (Normalizaiton + Activation Layer) Factory + +Create norm + act combo modules that attempt to be backwards compatible with separate norm + act +isntances in models. Where these are used it will be possible to swap separate BN + act layers with +combined modules like IABN or EvoNorms. + +Hacked together by / Copyright 2020 Ross Wightman +""" +import types +import functools + +import torch +import torch.nn as nn + +from .evo_norm import EvoNormBatch2d, EvoNormSample2d +from .norm_act import BatchNormAct2d, GroupNormAct +from .inplace_abn import InplaceAbn + +_NORM_ACT_TYPES = {BatchNormAct2d, GroupNormAct, EvoNormBatch2d, EvoNormSample2d, InplaceAbn} +_NORM_ACT_REQUIRES_ARG = {BatchNormAct2d, GroupNormAct, InplaceAbn} # requires act_layer arg to define act type + +def get_norm_act_layer(layer_class): + layer_class = layer_class.replace('_', '').lower() + if layer_class.startswith("batchnorm"): + layer = BatchNormAct2d + elif layer_class.startswith("groupnorm"): + layer = GroupNormAct + elif layer_class == "evonormbatch": + layer = EvoNormBatch2d + elif layer_class == "evonormsample": + layer = EvoNormSample2d + elif layer_class == "iabn" or layer_class == "inplaceabn": + layer = InplaceAbn + else: + assert False, "Invalid norm_act layer (%s)" % layer_class + return layer + + +def create_norm_act(layer_type, num_features, apply_act=True, jit=False, **kwargs): + layer_parts = layer_type.split('-') # e.g. batchnorm-leaky_relu + assert len(layer_parts) in (1, 2) + layer = get_norm_act_layer(layer_parts[0]) + #activation_class = layer_parts[1].lower() if len(layer_parts) > 1 else '' # FIXME support string act selection? 
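+ # e.g. (illustrative): create_norm_act('batchnorm', 64) resolves to BatchNormAct2d and returns
+ # BatchNormAct2d(64, apply_act=True), i.e. BN and its activation fused into one module.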
+ layer_instance = layer(num_features, apply_act=apply_act, **kwargs) + if jit: + layer_instance = torch.jit.script(layer_instance) + return layer_instance + + +def convert_norm_act_type(norm_layer, act_layer, norm_kwargs=None): + assert isinstance(norm_layer, (type, str, types.FunctionType, functools.partial)) + assert act_layer is None or isinstance(act_layer, (type, str, types.FunctionType, functools.partial)) + norm_act_args = norm_kwargs.copy() if norm_kwargs else {} + if isinstance(norm_layer, str): + norm_act_layer = get_norm_act_layer(norm_layer) + elif norm_layer in _NORM_ACT_TYPES: + norm_act_layer = norm_layer + elif isinstance(norm_layer, (types.FunctionType, functools.partial)): + # assuming this is a lambda/fn/bound partial that creates norm_act layer + norm_act_layer = norm_layer + else: + type_name = norm_layer.__name__.lower() + if type_name.startswith('batchnorm'): + norm_act_layer = BatchNormAct2d + elif type_name.startswith('groupnorm'): + norm_act_layer = GroupNormAct + else: + assert False, f"No equivalent norm_act layer for {type_name}" + if norm_act_layer in _NORM_ACT_REQUIRES_ARG: + # Must pass `act_layer` through for backwards compat where `act_layer=None` implies no activation. + # In the future, may force use of `apply_act` with `act_layer` arg bound to relevant NormAct types + # It is intended that functions/partial does not trigger this, they should define act. + norm_act_args.update(dict(act_layer=act_layer)) + return norm_act_layer, norm_act_args diff --git a/timm/models/quantization/layers/drop.py b/timm/models/quantization/layers/drop.py new file mode 100644 index 00000000..625f1e70 --- /dev/null +++ b/timm/models/quantization/layers/drop.py @@ -0,0 +1,167 @@ +""" DropBlock, DropPath + +PyTorch implementations of DropBlock and DropPath (Stochastic Depth) regularization layers. + +Papers: +DropBlock: A regularization method for convolutional networks (https://arxiv.org/abs/1810.12890) + +Deep Networks with Stochastic Depth (https://arxiv.org/abs/1603.09382) + +Code: +DropBlock impl inspired by two Tensorflow impl that I liked: + - https://github.com/tensorflow/tpu/blob/master/models/official/resnet/resnet_model.py#L74 + - https://github.com/clovaai/assembled-cnn/blob/master/nets/blocks.py + +Hacked together by / Copyright 2020 Ross Wightman +""" +import torch +import torch.nn as nn +import torch.nn.functional as F + + +def drop_block_2d( + x, drop_prob: float = 0.1, block_size: int = 7, gamma_scale: float = 1.0, + with_noise: bool = False, inplace: bool = False, batchwise: bool = False): + """ DropBlock. See https://arxiv.org/pdf/1810.12890.pdf + + DropBlock with an experimental gaussian noise option. This layer has been tested on a few training + runs with success, but needs further validation and possibly optimization for lower runtime impact. + """ + B, C, H, W = x.shape + total_size = W * H + clipped_block_size = min(block_size, min(W, H)) + # seed_drop_rate, the gamma parameter + gamma = gamma_scale * drop_prob * total_size / clipped_block_size ** 2 / ( + (W - block_size + 1) * (H - block_size + 1)) + + # Forces the block to be inside the feature map. 
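+ # Illustrative numbers (assumed, not from the paper): for H=W=28, drop_prob=0.1, block_size=7,
+ # gamma ~= 0.1 * 784 / 49 / (22 * 22) ~= 0.0033, i.e. roughly 0.3% of positions seed a dropped block.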
+ w_i, h_i = torch.meshgrid(torch.arange(W).to(x.device), torch.arange(H).to(x.device)) + valid_block = ((w_i >= clipped_block_size // 2) & (w_i < W - (clipped_block_size - 1) // 2)) & \ + ((h_i >= clipped_block_size // 2) & (h_i < H - (clipped_block_size - 1) // 2)) + valid_block = torch.reshape(valid_block, (1, 1, H, W)).to(dtype=x.dtype) + + if batchwise: + # one mask for whole batch, quite a bit faster + uniform_noise = torch.rand((1, C, H, W), dtype=x.dtype, device=x.device) + else: + uniform_noise = torch.rand_like(x) + block_mask = ((2 - gamma - valid_block + uniform_noise) >= 1).to(dtype=x.dtype) + block_mask = -F.max_pool2d( + -block_mask, + kernel_size=clipped_block_size, # block_size, + stride=1, + padding=clipped_block_size // 2) + + if with_noise: + normal_noise = torch.randn((1, C, H, W), dtype=x.dtype, device=x.device) if batchwise else torch.randn_like(x) + if inplace: + x.mul_(block_mask).add_(normal_noise * (1 - block_mask)) + else: + x = x * block_mask + normal_noise * (1 - block_mask) + else: + normalize_scale = (block_mask.numel() / block_mask.to(dtype=torch.float32).sum().add(1e-7)).to(x.dtype) + if inplace: + x.mul_(block_mask * normalize_scale) + else: + x = x * block_mask * normalize_scale + return x + + +def drop_block_fast_2d( + x: torch.Tensor, drop_prob: float = 0.1, block_size: int = 7, + gamma_scale: float = 1.0, with_noise: bool = False, inplace: bool = False, batchwise: bool = False): + """ DropBlock. See https://arxiv.org/pdf/1810.12890.pdf + + DropBlock with an experimental gaussian noise option. Simplied from above without concern for valid + block mask at edges. + """ + B, C, H, W = x.shape + total_size = W * H + clipped_block_size = min(block_size, min(W, H)) + gamma = gamma_scale * drop_prob * total_size / clipped_block_size ** 2 / ( + (W - block_size + 1) * (H - block_size + 1)) + + if batchwise: + # one mask for whole batch, quite a bit faster + block_mask = torch.rand((1, C, H, W), dtype=x.dtype, device=x.device) < gamma + else: + # mask per batch element + block_mask = torch.rand_like(x) < gamma + block_mask = F.max_pool2d( + block_mask.to(x.dtype), kernel_size=clipped_block_size, stride=1, padding=clipped_block_size // 2) + + if with_noise: + normal_noise = torch.randn((1, C, H, W), dtype=x.dtype, device=x.device) if batchwise else torch.randn_like(x) + if inplace: + x.mul_(1. - block_mask).add_(normal_noise * block_mask) + else: + x = x * (1. - block_mask) + normal_noise * block_mask + else: + block_mask = 1 - block_mask + normalize_scale = (block_mask.numel() / block_mask.to(dtype=torch.float32).sum().add(1e-7)).to(dtype=x.dtype) + if inplace: + x.mul_(block_mask * normalize_scale) + else: + x = x * block_mask * normalize_scale + return x + + +class DropBlock2d(nn.Module): + """ DropBlock. 
See https://arxiv.org/pdf/1810.12890.pdf + """ + def __init__(self, + drop_prob=0.1, + block_size=7, + gamma_scale=1.0, + with_noise=False, + inplace=False, + batchwise=False, + fast=True): + super(DropBlock2d, self).__init__() + self.drop_prob = drop_prob + self.gamma_scale = gamma_scale + self.block_size = block_size + self.with_noise = with_noise + self.inplace = inplace + self.batchwise = batchwise + self.fast = fast # FIXME finish comparisons of fast vs not + + def forward(self, x): + if not self.training or not self.drop_prob: + return x + if self.fast: + return drop_block_fast_2d( + x, self.drop_prob, self.block_size, self.gamma_scale, self.with_noise, self.inplace, self.batchwise) + else: + return drop_block_2d( + x, self.drop_prob, self.block_size, self.gamma_scale, self.with_noise, self.inplace, self.batchwise) + + +def drop_path(x, drop_prob: float = 0., training: bool = False): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + + This is the same as the DropConnect impl I created for EfficientNet, etc networks, however, + the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for + changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use + 'survival rate' as the argument. + + """ + if drop_prob == 0. or not training: + return x + keep_prob = 1 - drop_prob + random_tensor = keep_prob + torch.rand((x.size()[0], 1, 1, 1), dtype=x.dtype, device=x.device) + random_tensor.floor_() # binarize + output = x.div(keep_prob) * random_tensor + return output + + +class DropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + """ + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) diff --git a/timm/models/quantization/layers/evo_norm.py b/timm/models/quantization/layers/evo_norm.py new file mode 100644 index 00000000..9023afd0 --- /dev/null +++ b/timm/models/quantization/layers/evo_norm.py @@ -0,0 +1,83 @@ +"""EvoNormB0 (Batched) and EvoNormS0 (Sample) in PyTorch + +An attempt at getting decent performing EvoNorms running in PyTorch. +While currently faster than other impl, still quite a ways off the built-in BN +in terms of memory usage and throughput (roughly 5x mem, 1/2 - 1/3x speed). + +Still very much a WIP, fiddling with buffer usage, in-place/jit optimizations, and layouts. 
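+
+Illustrative drop-in usage (assumed, mirrors the classes below):
+
+    norm = EvoNormBatch2d(64)             # stands in for BatchNorm2d(64) + activation
+    y = norm(torch.randn(8, 64, 14, 14))  # same shape out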
+ +Hacked together by / Copyright 2020 Ross Wightman +""" + +import torch +import torch.nn as nn + + +class EvoNormBatch2d(nn.Module): + def __init__(self, num_features, apply_act=True, momentum=0.1, eps=1e-5, drop_block=None): + super(EvoNormBatch2d, self).__init__() + self.apply_act = apply_act # apply activation (non-linearity) + self.momentum = momentum + self.eps = eps + param_shape = (1, num_features, 1, 1) + self.weight = nn.Parameter(torch.ones(param_shape), requires_grad=True) + self.bias = nn.Parameter(torch.zeros(param_shape), requires_grad=True) + if apply_act: + self.v = nn.Parameter(torch.ones(param_shape), requires_grad=True) + self.register_buffer('running_var', torch.ones(1, num_features, 1, 1)) + self.reset_parameters() + + def reset_parameters(self): + nn.init.ones_(self.weight) + nn.init.zeros_(self.bias) + if self.apply_act: + nn.init.ones_(self.v) + + def forward(self, x): + assert x.dim() == 4, 'expected 4D input' + x_type = x.dtype + if self.training: + var = x.var(dim=(0, 2, 3), unbiased=False, keepdim=True) + n = x.numel() / x.shape[1] + self.running_var.copy_( + var.detach() * self.momentum * (n / (n - 1)) + self.running_var * (1 - self.momentum)) + else: + var = self.running_var + + if self.apply_act: + v = self.v.to(dtype=x_type) + d = x * v + (x.var(dim=(2, 3), unbiased=False, keepdim=True) + self.eps).sqrt().to(dtype=x_type) + d = d.max((var + self.eps).sqrt().to(dtype=x_type)) + x = x / d + return x * self.weight + self.bias + + +class EvoNormSample2d(nn.Module): + def __init__(self, num_features, apply_act=True, groups=8, eps=1e-5, drop_block=None): + super(EvoNormSample2d, self).__init__() + self.apply_act = apply_act # apply activation (non-linearity) + self.groups = groups + self.eps = eps + param_shape = (1, num_features, 1, 1) + self.weight = nn.Parameter(torch.ones(param_shape), requires_grad=True) + self.bias = nn.Parameter(torch.zeros(param_shape), requires_grad=True) + if apply_act: + self.v = nn.Parameter(torch.ones(param_shape), requires_grad=True) + self.reset_parameters() + + def reset_parameters(self): + nn.init.ones_(self.weight) + nn.init.zeros_(self.bias) + if self.apply_act: + nn.init.ones_(self.v) + + def forward(self, x): + assert x.dim() == 4, 'expected 4D input' + B, C, H, W = x.shape + assert C % self.groups == 0 + if self.apply_act: + n = x * (x * self.v).sigmoid() + x = x.reshape(B, self.groups, -1) + x = n.reshape(B, self.groups, -1) / (x.var(dim=-1, unbiased=False, keepdim=True) + self.eps).sqrt() + x = x.reshape(B, C, H, W) + return x * self.weight + self.bias diff --git a/timm/models/quantization/layers/helpers.py b/timm/models/quantization/layers/helpers.py new file mode 100644 index 00000000..d9aec8af --- /dev/null +++ b/timm/models/quantization/layers/helpers.py @@ -0,0 +1,27 @@ +""" Layer/Module Helpers + +Hacked together by / Copyright 2020 Ross Wightman +""" +from itertools import repeat +from torch._six import container_abcs + + +# From PyTorch internals +def _ntuple(n): + def parse(x): + if isinstance(x, container_abcs.Iterable): + return x + return tuple(repeat(x, n)) + return parse + + +tup_single = _ntuple(1) +tup_pair = _ntuple(2) +tup_triple = _ntuple(3) +tup_quadruple = _ntuple(4) +ntup = _ntuple + + + + + diff --git a/timm/models/quantization/layers/inplace_abn.py b/timm/models/quantization/layers/inplace_abn.py new file mode 100644 index 00000000..c7edac62 --- /dev/null +++ b/timm/models/quantization/layers/inplace_abn.py @@ -0,0 +1,87 @@ +import torch +from torch import nn as nn + +try: + from 
inplace_abn.functions import inplace_abn, inplace_abn_sync + has_iabn = True +except ImportError: + has_iabn = False + + def inplace_abn(x, weight, bias, running_mean, running_var, + training=True, momentum=0.1, eps=1e-05, activation="leaky_relu", activation_param=0.01): + raise ImportError( + "Please install InplaceABN:'pip install git+https://github.com/mapillary/inplace_abn.git@v1.0.11'") + + def inplace_abn_sync(**kwargs): + inplace_abn(**kwargs) + + +class InplaceAbn(nn.Module): + """Activated Batch Normalization + + This gathers a BatchNorm and an activation function in a single module + + Parameters + ---------- + num_features : int + Number of feature channels in the input and output. + eps : float + Small constant to prevent numerical issues. + momentum : float + Momentum factor applied to compute running statistics. + affine : bool + If `True` apply learned scale and shift transformation after normalization. + act_layer : str or nn.Module type + Name or type of the activation functions, one of: `leaky_relu`, `elu` + act_param : float + Negative slope for the `leaky_relu` activation. + """ + + def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True, apply_act=True, + act_layer="leaky_relu", act_param=0.01, drop_block=None): + super(InplaceAbn, self).__init__() + self.num_features = num_features + self.affine = affine + self.eps = eps + self.momentum = momentum + if apply_act: + if isinstance(act_layer, str): + assert act_layer in ('leaky_relu', 'elu', 'identity', '') + self.act_name = act_layer if act_layer else 'identity' + else: + # convert act layer passed as type to string + if act_layer == nn.ELU: + self.act_name = 'elu' + elif act_layer == nn.LeakyReLU: + self.act_name = 'leaky_relu' + elif act_layer == nn.Identity: + self.act_name = 'identity' + else: + assert False, f'Invalid act layer {act_layer.__name__} for IABN' + else: + self.act_name = 'identity' + self.act_param = act_param + if self.affine: + self.weight = nn.Parameter(torch.ones(num_features)) + self.bias = nn.Parameter(torch.zeros(num_features)) + else: + self.register_parameter('weight', None) + self.register_parameter('bias', None) + self.register_buffer('running_mean', torch.zeros(num_features)) + self.register_buffer('running_var', torch.ones(num_features)) + self.reset_parameters() + + def reset_parameters(self): + nn.init.constant_(self.running_mean, 0) + nn.init.constant_(self.running_var, 1) + if self.affine: + nn.init.constant_(self.weight, 1) + nn.init.constant_(self.bias, 0) + + def forward(self, x): + output = inplace_abn( + x, self.weight, self.bias, self.running_mean, self.running_var, + self.training, self.momentum, self.eps, self.act_name, self.act_param) + if isinstance(output, tuple): + output = output[0] + return output diff --git a/timm/models/quantization/layers/median_pool.py b/timm/models/quantization/layers/median_pool.py new file mode 100644 index 00000000..f900229f --- /dev/null +++ b/timm/models/quantization/layers/median_pool.py @@ -0,0 +1,49 @@ +""" Median Pool +Hacked together by / Copyright 2020 Ross Wightman +""" +import torch.nn as nn +import torch.nn.functional as F +from .helpers import tup_pair, tup_quadruple + + +class MedianPool2d(nn.Module): + """ Median pool (usable as median filter when stride=1) module. 
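A note on the InplaceAbn fallback above: the module still constructs when the external inplace_abn package is missing, but its forward raises the ImportError from the stub. A minimal sketch under that assumption (import path taken from this patch's layout):

import torch
from timm.models.quantization.layers.inplace_abn import InplaceAbn, has_iabn  # path assumed

abn = InplaceAbn(32, act_layer='leaky_relu', act_param=0.01)
x = torch.randn(2, 32, 8, 8)
if has_iabn:
    y = abn(x)  # fused BN + leaky_relu computed in place by the inplace_abn extension
else:
    try:
        abn(x)
    except ImportError as e:
        print(e)  # points at the pip install command for inplace_abn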
+ + Args: + kernel_size: size of pooling kernel, int or 2-tuple + stride: pool stride, int or 2-tuple + padding: pool padding, int or 4-tuple (l, r, t, b) as in pytorch F.pad + same: override padding and enforce same padding, boolean + """ + def __init__(self, kernel_size=3, stride=1, padding=0, same=False): + super(MedianPool2d, self).__init__() + self.k = tup_pair(kernel_size) + self.stride = tup_pair(stride) + self.padding = tup_quadruple(padding) # convert to l, r, t, b + self.same = same + + def _padding(self, x): + if self.same: + ih, iw = x.size()[2:] + if ih % self.stride[0] == 0: + ph = max(self.k[0] - self.stride[0], 0) + else: + ph = max(self.k[0] - (ih % self.stride[0]), 0) + if iw % self.stride[1] == 0: + pw = max(self.k[1] - self.stride[1], 0) + else: + pw = max(self.k[1] - (iw % self.stride[1]), 0) + pl = pw // 2 + pr = pw - pl + pt = ph // 2 + pb = ph - pt + padding = (pl, pr, pt, pb) + else: + padding = self.padding + return padding + + def forward(self, x): + x = F.pad(x, self._padding(x), mode='reflect') + x = x.unfold(2, self.k[0], self.stride[0]).unfold(3, self.k[1], self.stride[1]) + x = x.contiguous().view(x.size()[:4] + (-1,)).median(dim=-1)[0] + return x diff --git a/timm/models/quantization/layers/mixed_conv2d.py b/timm/models/quantization/layers/mixed_conv2d.py new file mode 100644 index 00000000..53d650cd --- /dev/null +++ b/timm/models/quantization/layers/mixed_conv2d.py @@ -0,0 +1,51 @@ +""" PyTorch Mixed Convolution + +Paper: MixConv: Mixed Depthwise Convolutional Kernels (https://arxiv.org/abs/1907.09595) + +Hacked together by / Copyright 2020 Ross Wightman +""" + +import torch +from torch import nn as nn + +from .conv2d_same import create_conv2d_pad + + +def _split_channels(num_chan, num_groups): + split = [num_chan // num_groups for _ in range(num_groups)] + split[0] += num_chan - sum(split) + return split + + +class MixedConv2d(nn.ModuleDict): + """ Mixed Grouped Convolution + + Based on MDConv and GroupedConv in MixNet impl: + https://github.com/tensorflow/tpu/blob/master/models/official/mnasnet/mixnet/custom_layers.py + """ + def __init__(self, in_channels, out_channels, kernel_size=3, + stride=1, padding='', dilation=1, depthwise=False, **kwargs): + super(MixedConv2d, self).__init__() + + kernel_size = kernel_size if isinstance(kernel_size, list) else [kernel_size] + num_groups = len(kernel_size) + in_splits = _split_channels(in_channels, num_groups) + out_splits = _split_channels(out_channels, num_groups) + self.in_channels = sum(in_splits) + self.out_channels = sum(out_splits) + for idx, (k, in_ch, out_ch) in enumerate(zip(kernel_size, in_splits, out_splits)): + conv_groups = out_ch if depthwise else 1 + # use add_module to keep key space clean + self.add_module( + str(idx), + create_conv2d_pad( + in_ch, out_ch, k, stride=stride, + padding=padding, dilation=dilation, groups=conv_groups, **kwargs) + ) + self.splits = in_splits + + def forward(self, x): + x_split = torch.split(x, self.splits, 1) + x_out = [c(x_split[i]) for i, c in enumerate(self.values())] + x = torch.cat(x_out, 1) + return x diff --git a/timm/models/quantization/layers/norm_act.py b/timm/models/quantization/layers/norm_act.py new file mode 100644 index 00000000..bddf9b26 --- /dev/null +++ b/timm/models/quantization/layers/norm_act.py @@ -0,0 +1,86 @@ +""" Normalization + Activation Layers +""" +import torch +from torch import nn as nn +from torch.nn import functional as F + +from .create_act import get_act_layer + + +class BatchNormAct2d(nn.BatchNorm2d): + """BatchNorm + Activation + 
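To make the MixedConv2d channel split above concrete: each kernel size gets an equal share of the channels and any remainder goes to the first group, then the groups are convolved separately and re-concatenated. The numbers in the comments are the split arithmetic worked by hand; the import path is assumed from this patch's layout.

import torch
from timm.models.quantization.layers.mixed_conv2d import MixedConv2d  # path assumed

# _split_channels(32, 3) -> [12, 10, 10]   (32 // 3 = 10 each, remainder 2 added to group 0)
m = MixedConv2d(32, 64, kernel_size=[3, 5, 7], stride=1, padding='', depthwise=False)
x = torch.randn(2, 32, 56, 56)
y = m(x)          # each group convolved with its own kernel size, outputs concatenated on dim 1
print(y.shape)    # torch.Size([2, 64, 56, 56])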
+ This module performs BatchNorm + Activation in a manner that will remain backwards + compatible with weights trained with separate bn, act. This is why we inherit from BN + instead of composing it as a .bn member. + """ + def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True, track_running_stats=True, + apply_act=True, act_layer=nn.ReLU, inplace=True, drop_block=None): + super(BatchNormAct2d, self).__init__( + num_features, eps=eps, momentum=momentum, affine=affine, track_running_stats=track_running_stats) + if isinstance(act_layer, str): + act_layer = get_act_layer(act_layer) + if act_layer is not None and apply_act: + act_args = dict(inplace=True) if inplace else {} + self.act = act_layer(**act_args) + else: + self.act = None + + def _forward_jit(self, x): + """ A cut & paste of the contents of the PyTorch BatchNorm2d forward function + """ + # exponential_average_factor is self.momentum set to + # (when it is available) only so that if gets updated + # in ONNX graph when this node is exported to ONNX. + if self.momentum is None: + exponential_average_factor = 0.0 + else: + exponential_average_factor = self.momentum + + if self.training and self.track_running_stats: + # TODO: if statement only here to tell the jit to skip emitting this when it is None + if self.num_batches_tracked is not None: + self.num_batches_tracked += 1 + if self.momentum is None: # use cumulative moving average + exponential_average_factor = 1.0 / float(self.num_batches_tracked) + else: # use exponential moving average + exponential_average_factor = self.momentum + + x = F.batch_norm( + x, self.running_mean, self.running_var, self.weight, self.bias, + self.training or not self.track_running_stats, + exponential_average_factor, self.eps) + return x + + @torch.jit.ignore + def _forward_python(self, x): + return super(BatchNormAct2d, self).forward(x) + + def forward(self, x): + # FIXME cannot call parent forward() and maintain jit.script compatibility? 
+ if torch.jit.is_scripting(): + x = self._forward_jit(x) + else: + x = self._forward_python(x) + if self.act is not None: + x = self.act(x) + return x + + +class GroupNormAct(nn.GroupNorm): + + def __init__(self, num_groups, num_channels, eps=1e-5, affine=True, + apply_act=True, act_layer=nn.ReLU, inplace=True, drop_block=None): + super(GroupNormAct, self).__init__(num_groups, num_channels, eps=eps, affine=affine) + if isinstance(act_layer, str): + act_layer = get_act_layer(act_layer) + if act_layer is not None and apply_act: + self.act = act_layer(inplace=inplace) + else: + self.act = None + + def forward(self, x): + x = F.group_norm(x, self.num_groups, self.weight, self.bias, self.eps) + if self.act is not None: + x = self.act(x) + return x diff --git a/timm/models/quantization/layers/padding.py b/timm/models/quantization/layers/padding.py new file mode 100644 index 00000000..34afc37c --- /dev/null +++ b/timm/models/quantization/layers/padding.py @@ -0,0 +1,56 @@ +""" Padding Helpers + +Hacked together by / Copyright 2020 Ross Wightman +""" +import math +from typing import List, Tuple + +import torch.nn.functional as F + + +# Calculate symmetric padding for a convolution +def get_padding(kernel_size: int, stride: int = 1, dilation: int = 1, **_) -> int: + padding = ((stride - 1) + dilation * (kernel_size - 1)) // 2 + return padding + + +# Calculate asymmetric TensorFlow-like 'SAME' padding for a convolution +def get_same_padding(x: int, k: int, s: int, d: int): + return max((math.ceil(x / s) - 1) * s + (k - 1) * d + 1 - x, 0) + + +# Can SAME padding for given args be done statically? +def is_static_pad(kernel_size: int, stride: int = 1, dilation: int = 1, **_): + return stride == 1 and (dilation * (kernel_size - 1)) % 2 == 0 + + +# Dynamically pad input x with 'SAME' padding for conv with specified args +def pad_same(x, k: List[int], s: List[int], d: List[int] = (1, 1), value: float = 0): + ih, iw = x.size()[-2:] + pad_h, pad_w = get_same_padding(ih, k[0], s[0], d[0]), get_same_padding(iw, k[1], s[1], d[1]) + if pad_h > 0 or pad_w > 0: + x = F.pad(x, [pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2], value=value) + return x + + +def get_padding_value(padding, kernel_size, **kwargs) -> Tuple[Tuple, bool]: + dynamic = False + if isinstance(padding, str): + # for any string padding, the padding will be calculated for you, one of three ways + padding = padding.lower() + if padding == 'same': + # TF compatible 'SAME' padding, has a performance and GPU memory allocation impact + if is_static_pad(kernel_size, **kwargs): + # static case, no extra overhead + padding = get_padding(kernel_size, **kwargs) + else: + # dynamic 'SAME' padding, has runtime/GPU memory overhead + padding = 0 + dynamic = True + elif padding == 'valid': + # 'VALID' padding, same as padding=0 + padding = 0 + else: + # Default to PyTorch style 'same'-ish symmetric padding + padding = get_padding(kernel_size, **kwargs) + return padding, dynamic diff --git a/timm/models/quantization/layers/pool2d_same.py b/timm/models/quantization/layers/pool2d_same.py new file mode 100644 index 00000000..2e61b426 --- /dev/null +++ b/timm/models/quantization/layers/pool2d_same.py @@ -0,0 +1,71 @@ +""" AvgPool2d w/ Same Padding + +Hacked together by / Copyright 2020 Ross Wightman +""" +import torch +import torch.nn as nn +import torch.nn.functional as F +from typing import List, Tuple, Optional + +from .helpers import tup_pair +from .padding import pad_same, get_padding_value + + +def avg_pool2d_same(x, kernel_size: List[int], 
stride: List[int], padding: List[int] = (0, 0), + ceil_mode: bool = False, count_include_pad: bool = True): + # FIXME how to deal with count_include_pad vs not for external padding? + x = pad_same(x, kernel_size, stride) + return F.avg_pool2d(x, kernel_size, stride, (0, 0), ceil_mode, count_include_pad) + + +class AvgPool2dSame(nn.AvgPool2d): + """ Tensorflow like 'SAME' wrapper for 2D average pooling + """ + def __init__(self, kernel_size: int, stride=None, padding=0, ceil_mode=False, count_include_pad=True): + kernel_size = tup_pair(kernel_size) + stride = tup_pair(stride) + super(AvgPool2dSame, self).__init__(kernel_size, stride, (0, 0), ceil_mode, count_include_pad) + + def forward(self, x): + return avg_pool2d_same( + x, self.kernel_size, self.stride, self.padding, self.ceil_mode, self.count_include_pad) + + +def max_pool2d_same( + x, kernel_size: List[int], stride: List[int], padding: List[int] = (0, 0), + dilation: List[int] = (1, 1), ceil_mode: bool = False): + x = pad_same(x, kernel_size, stride, value=-float('inf')) + return F.max_pool2d(x, kernel_size, stride, (0, 0), dilation, ceil_mode) + + +class MaxPool2dSame(nn.MaxPool2d): + """ Tensorflow like 'SAME' wrapper for 2D max pooling + """ + def __init__(self, kernel_size: int, stride=None, padding=0, dilation=1, ceil_mode=False, count_include_pad=True): + kernel_size = tup_pair(kernel_size) + stride = tup_pair(stride) + dilation = tup_pair(dilation) + super(MaxPool2dSame, self).__init__(kernel_size, stride, (0, 0), dilation, ceil_mode, count_include_pad) + + def forward(self, x): + return max_pool2d_same(x, self.kernel_size, self.stride, self.padding, self.dilation, self.ceil_mode) + + +def create_pool2d(pool_type, kernel_size, stride=None, **kwargs): + stride = stride or kernel_size + padding = kwargs.pop('padding', '') + padding, is_dynamic = get_padding_value(padding, kernel_size, stride=stride, **kwargs) + if is_dynamic: + if pool_type == 'avg': + return AvgPool2dSame(kernel_size, stride=stride, **kwargs) + elif pool_type == 'max': + return MaxPool2dSame(kernel_size, stride=stride, **kwargs) + else: + assert False, f'Unsupported pool type {pool_type}' + else: + if pool_type == 'avg': + return nn.AvgPool2d(kernel_size, stride=stride, padding=padding, **kwargs) + elif pool_type == 'max': + return nn.MaxPool2d(kernel_size, stride=stride, padding=padding, **kwargs) + else: + assert False, f'Unsupported pool type {pool_type}' diff --git a/timm/models/quantization/layers/selective_kernel.py b/timm/models/quantization/layers/selective_kernel.py new file mode 100644 index 00000000..2efaa487 --- /dev/null +++ b/timm/models/quantization/layers/selective_kernel.py @@ -0,0 +1,120 @@ +""" Selective Kernel Convolution/Attention + +Paper: Selective Kernel Networks (https://arxiv.org/abs/1903.06586) + +Hacked together by / Copyright 2020 Ross Wightman +""" +import torch +from torch import nn as nn + +from .conv_bn_act import ConvBnAct + + +def _kernel_valid(k): + if isinstance(k, (list, tuple)): + for ki in k: + return _kernel_valid(ki) + assert k >= 3 and k % 2 + + +class SelectiveKernelAttn(nn.Module): + def __init__(self, channels, num_paths=2, attn_channels=32, + act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d): + """ Selective Kernel Attention Module + + Selective Kernel attention mechanism factored out into its own module. 
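Back on the padding helpers and create_pool2d above: dynamic 'same' padding pads the input at runtime so the output size is ceil(input / stride), matching TensorFlow. A worked sketch, with get_same_padding evaluated by hand in the comment (import path assumed from this patch's layout):

import torch
from timm.models.quantization.layers.pool2d_same import create_pool2d  # path assumed

# get_same_padding(x=7, k=3, s=2, d=1) = max((ceil(7/2)-1)*2 + (3-1)*1 + 1 - 7, 0) = 2,
# i.e. 2 rows/cols of padding in total (1 per side here), giving an output of ceil(7/2) = 4
pool = create_pool2d('max', kernel_size=3, stride=2, padding='same')
x = torch.randn(1, 8, 7, 7)
print(pool(x).shape)   # torch.Size([1, 8, 4, 4])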
+ + """ + super(SelectiveKernelAttn, self).__init__() + self.num_paths = num_paths + self.pool = nn.AdaptiveAvgPool2d(1) + self.fc_reduce = nn.Conv2d(channels, attn_channels, kernel_size=1, bias=False) + self.bn = norm_layer(attn_channels) + self.act = act_layer(inplace=True) + self.fc_select = nn.Conv2d(attn_channels, channels * num_paths, kernel_size=1, bias=False) + + def forward(self, x): + assert x.shape[1] == self.num_paths + x = torch.sum(x, dim=1) + x = self.pool(x) + x = self.fc_reduce(x) + x = self.bn(x) + x = self.act(x) + x = self.fc_select(x) + B, C, H, W = x.shape + x = x.view(B, self.num_paths, C // self.num_paths, H, W) + x = torch.softmax(x, dim=1) + return x + + +class SelectiveKernelConv(nn.Module): + + def __init__(self, in_channels, out_channels, kernel_size=None, stride=1, dilation=1, groups=1, + attn_reduction=16, min_attn_channels=32, keep_3x3=True, split_input=False, + drop_block=None, act_layer=nn.ReLU, norm_layer=nn.BatchNorm2d, aa_layer=None): + """ Selective Kernel Convolution Module + + As described in Selective Kernel Networks (https://arxiv.org/abs/1903.06586) with some modifications. + + Largest change is the input split, which divides the input channels across each convolution path, this can + be viewed as a grouping of sorts, but the output channel counts expand to the module level value. This keeps + the parameter count from ballooning when the convolutions themselves don't have groups, but still provides + a noteworthy increase in performance over similar param count models without this attention layer. -Ross W + + Args: + in_channels (int): module input (feature) channel count + out_channels (int): module output (feature) channel count + kernel_size (int, list): kernel size for each convolution branch + stride (int): stride for convolutions + dilation (int): dilation for module as a whole, impacts dilation of each branch + groups (int): number of groups for each branch + attn_reduction (int, float): reduction factor for attention features + min_attn_channels (int): minimum attention feature channels + keep_3x3 (bool): keep all branch convolution kernels as 3x3, changing larger kernels for dilations + split_input (bool): split input channels evenly across each convolution branch, keeps param count lower, + can be viewed as grouping by path, output expands to module out_channels count + drop_block (nn.Module): drop block module + act_layer (nn.Module): activation layer to use + norm_layer (nn.Module): batchnorm/norm layer to use + """ + super(SelectiveKernelConv, self).__init__() + kernel_size = kernel_size or [3, 5] # default to one 3x3 and one 5x5 branch. 
5x5 -> 3x3 + dilation + _kernel_valid(kernel_size) + if not isinstance(kernel_size, list): + kernel_size = [kernel_size] * 2 + if keep_3x3: + dilation = [dilation * (k - 1) // 2 for k in kernel_size] + kernel_size = [3] * len(kernel_size) + else: + dilation = [dilation] * len(kernel_size) + self.num_paths = len(kernel_size) + self.in_channels = in_channels + self.out_channels = out_channels + self.split_input = split_input + if self.split_input: + assert in_channels % self.num_paths == 0 + in_channels = in_channels // self.num_paths + groups = min(out_channels, groups) + + conv_kwargs = dict( + stride=stride, groups=groups, drop_block=drop_block, act_layer=act_layer, norm_layer=norm_layer, + aa_layer=aa_layer) + self.paths = nn.ModuleList([ + ConvBnAct(in_channels, out_channels, kernel_size=k, dilation=d, **conv_kwargs) + for k, d in zip(kernel_size, dilation)]) + + attn_channels = max(int(out_channels / attn_reduction), min_attn_channels) + self.attn = SelectiveKernelAttn(out_channels, self.num_paths, attn_channels) + self.drop_block = drop_block + + def forward(self, x): + if self.split_input: + x_split = torch.split(x, self.in_channels // self.num_paths, 1) + x_paths = [op(x_split[i]) for i, op in enumerate(self.paths)] + else: + x_paths = [op(x) for op in self.paths] + x = torch.stack(x_paths, dim=1) + x_attn = self.attn(x) + x = x * x_attn + x = torch.sum(x, dim=1) + return x diff --git a/timm/models/quantization/layers/separable_conv.py b/timm/models/quantization/layers/separable_conv.py new file mode 100644 index 00000000..e949ea43 --- /dev/null +++ b/timm/models/quantization/layers/separable_conv.py @@ -0,0 +1,74 @@ +""" Depthwise Separable Conv Modules + +Basic DWS convs. Other variations of DWS exist with batch norm or activations between the +DW and PW convs such as the Depthwise modules in MobileNetV2 / EfficientNet and Xception. 
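Returning to SelectiveKernelConv above: with the defaults it builds a 3x3 branch and a dilated 3x3 branch standing in for the 5x5, then blends the two paths with the learned attention. A small usage sketch (import path assumed from this patch's layout):

import torch
from timm.models.quantization.layers.selective_kernel import SelectiveKernelConv  # path assumed

# default kernel_size -> [3, 5]; keep_3x3 rewrites this to two 3x3 convs with dilations [1, 2]
sk = SelectiveKernelConv(64, 128, stride=1, split_input=True)
x = torch.randn(2, 64, 32, 32)
y = sk(x)        # each path sees 32 of the 64 input channels; outputs are stacked, attention-weighted, summed
print(y.shape)   # torch.Size([2, 128, 32, 32])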
+ +Hacked together by / Copyright 2020 Ross Wightman +""" +from torch import nn as nn + +from .create_conv2d import create_conv2d +from .create_norm_act import convert_norm_act_type + + +class SeparableConvBnAct(nn.Module): + """ Separable Conv w/ trailing Norm and Activation + """ + def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, dilation=1, padding='', bias=False, + channel_multiplier=1.0, pw_kernel_size=1, norm_layer=nn.BatchNorm2d, norm_kwargs=None, + act_layer=nn.ReLU, apply_act=True, drop_block=None): + super(SeparableConvBnAct, self).__init__() + norm_kwargs = norm_kwargs or {} + + self.conv_dw = create_conv2d( + in_channels, int(in_channels * channel_multiplier), kernel_size, + stride=stride, dilation=dilation, padding=padding, depthwise=True) + + self.conv_pw = create_conv2d( + int(in_channels * channel_multiplier), out_channels, pw_kernel_size, padding=padding, bias=bias) + + norm_act_layer, norm_act_args = convert_norm_act_type(norm_layer, act_layer, norm_kwargs) + self.bn = norm_act_layer(out_channels, apply_act=apply_act, drop_block=drop_block, **norm_act_args) + + @property + def in_channels(self): + return self.conv_dw.in_channels + + @property + def out_channels(self): + return self.conv_pw.out_channels + + def forward(self, x): + x = self.conv_dw(x) + x = self.conv_pw(x) + if self.bn is not None: + x = self.bn(x) + return x + + +class SeparableConv2d(nn.Module): + """ Separable Conv + """ + def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, dilation=1, padding='', bias=False, + channel_multiplier=1.0, pw_kernel_size=1): + super(SeparableConv2d, self).__init__() + + self.conv_dw = create_conv2d( + in_channels, int(in_channels * channel_multiplier), kernel_size, + stride=stride, dilation=dilation, padding=padding, depthwise=True) + + self.conv_pw = create_conv2d( + int(in_channels * channel_multiplier), out_channels, pw_kernel_size, padding=padding, bias=bias) + + @property + def in_channels(self): + return self.conv_dw.in_channels + + @property + def out_channels(self): + return self.conv_pw.out_channels + + def forward(self, x): + x = self.conv_dw(x) + x = self.conv_pw(x) + return x diff --git a/timm/models/quantization/layers/space_to_depth.py b/timm/models/quantization/layers/space_to_depth.py new file mode 100644 index 00000000..a7e8e0b2 --- /dev/null +++ b/timm/models/quantization/layers/space_to_depth.py @@ -0,0 +1,53 @@ +import torch +import torch.nn as nn + + +class SpaceToDepth(nn.Module): + def __init__(self, block_size=4): + super().__init__() + assert block_size == 4 + self.bs = block_size + + def forward(self, x): + N, C, H, W = x.size() + x = x.view(N, C, H // self.bs, self.bs, W // self.bs, self.bs) # (N, C, H//bs, bs, W//bs, bs) + x = x.permute(0, 3, 5, 1, 2, 4).contiguous() # (N, bs, bs, C, H//bs, W//bs) + x = x.view(N, C * (self.bs ** 2), H // self.bs, W // self.bs) # (N, C*bs^2, H//bs, W//bs) + return x + + +@torch.jit.script +class SpaceToDepthJit(object): + def __call__(self, x: torch.Tensor): + # assuming hard-coded that block_size==4 for acceleration + N, C, H, W = x.size() + x = x.view(N, C, H // 4, 4, W // 4, 4) # (N, C, H//bs, bs, W//bs, bs) + x = x.permute(0, 3, 5, 1, 2, 4).contiguous() # (N, bs, bs, C, H//bs, W//bs) + x = x.view(N, C * 16, H // 4, W // 4) # (N, C*bs^2, H//bs, W//bs) + return x + + +class SpaceToDepthModule(nn.Module): + def __init__(self, no_jit=False): + super().__init__() + if not no_jit: + self.op = SpaceToDepthJit() + else: + self.op = SpaceToDepth() + + def forward(self, x): + 
return self.op(x) + + +class DepthToSpace(nn.Module): + + def __init__(self, block_size): + super().__init__() + self.bs = block_size + + def forward(self, x): + N, C, H, W = x.size() + x = x.view(N, self.bs, self.bs, C // (self.bs ** 2), H, W) # (N, bs, bs, C//bs^2, H, W) + x = x.permute(0, 3, 4, 1, 5, 2).contiguous() # (N, C//bs^2, H, bs, W, bs) + x = x.view(N, C // (self.bs ** 2), H * self.bs, W * self.bs) # (N, C//bs^2, H * bs, W * bs) + return x diff --git a/timm/models/quantization/layers/split_attn.py b/timm/models/quantization/layers/split_attn.py new file mode 100644 index 00000000..5615aa0b --- /dev/null +++ b/timm/models/quantization/layers/split_attn.py @@ -0,0 +1,88 @@ +""" Split Attention Conv2d (for ResNeSt Models) + +Paper: `ResNeSt: Split-Attention Networks` - /https://arxiv.org/abs/2004.08955 + +Adapted from original PyTorch impl at https://github.com/zhanghang1989/ResNeSt + +Modified for torchscript compat, performance, and consistency with timm by Ross Wightman +""" +import torch +import torch.nn.functional as F +from torch import nn + + +class RadixSoftmax(nn.Module): + def __init__(self, radix, cardinality): + super(RadixSoftmax, self).__init__() + self.radix = radix + self.cardinality = cardinality + + def forward(self, x): + batch = x.size(0) + if self.radix > 1: + x = x.view(batch, self.cardinality, self.radix, -1).transpose(1, 2) + x = F.softmax(x, dim=1) + x = x.reshape(batch, -1) + else: + x = torch.sigmoid(x) + return x + + +class SplitAttnConv2d(nn.Module): + """Split-Attention Conv2d + """ + def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=0, + dilation=1, groups=1, bias=False, radix=2, reduction_factor=4, + act_layer=nn.ReLU, norm_layer=None, drop_block=None, **kwargs): + super(SplitAttnConv2d, self).__init__() + self.radix = radix + self.drop_block = drop_block + mid_chs = out_channels * radix + attn_chs = max(in_channels * radix // reduction_factor, 32) + + self.conv = nn.Conv2d( + in_channels, mid_chs, kernel_size, stride, padding, dilation, + groups=groups * radix, bias=bias, **kwargs) + self.bn0 = norm_layer(mid_chs) if norm_layer is not None else None + self.act0 = act_layer(inplace=True) + self.fc1 = nn.Conv2d(out_channels, attn_chs, 1, groups=groups) + self.bn1 = norm_layer(attn_chs) if norm_layer is not None else None + self.act1 = act_layer(inplace=True) + self.fc2 = nn.Conv2d(attn_chs, mid_chs, 1, groups=groups) + self.rsoftmax = RadixSoftmax(radix, groups) + + @property + def in_channels(self): + return self.conv.in_channels + + @property + def out_channels(self): + return self.fc1.out_channels + + def forward(self, x): + x = self.conv(x) + if self.bn0 is not None: + x = self.bn0(x) + if self.drop_block is not None: + x = self.drop_block(x) + x = self.act0(x) + + B, RC, H, W = x.shape + if self.radix > 1: + x = x.reshape((B, self.radix, RC // self.radix, H, W)) + x_gap = x.sum(dim=1) + else: + x_gap = x + x_gap = F.adaptive_avg_pool2d(x_gap, 1) + x_gap = self.fc1(x_gap) + if self.bn1 is not None: + x_gap = self.bn1(x_gap) + x_gap = self.act1(x_gap) + x_attn = self.fc2(x_gap) + + x_attn = self.rsoftmax(x_attn).view(B, -1, 1, 1) + if self.radix > 1: + out = (x * x_attn.reshape((B, self.radix, RC // self.radix, 1, 1))).sum(dim=1) + else: + out = x * x_attn + return out.contiguous() diff --git a/timm/models/quantization/layers/split_batchnorm.py b/timm/models/quantization/layers/split_batchnorm.py new file mode 100644 index 00000000..830781b3 --- /dev/null +++ b/timm/models/quantization/layers/split_batchnorm.py @@ -0,0 
+1,75 @@ +""" Split BatchNorm + +A PyTorch BatchNorm layer that splits input batch into N equal parts and passes each through +a separate BN layer. The first split is passed through the parent BN layers with weight/bias +keys the same as the original BN. All other splits pass through BN sub-layers under the '.aux_bn' +namespace. + +This allows easily removing the auxiliary BN layers after training to efficiently +achieve the 'Auxiliary BatchNorm' as described in the AdvProp Paper, section 4.2, +'Disentangled Learning via An Auxiliary BN' + +Hacked together by / Copyright 2020 Ross Wightman +""" +import torch +import torch.nn as nn + + +class SplitBatchNorm2d(torch.nn.BatchNorm2d): + + def __init__(self, num_features, eps=1e-5, momentum=0.1, affine=True, + track_running_stats=True, num_splits=2): + super().__init__(num_features, eps, momentum, affine, track_running_stats) + assert num_splits > 1, 'Should have at least one aux BN layer (num_splits at least 2)' + self.num_splits = num_splits + self.aux_bn = nn.ModuleList([ + nn.BatchNorm2d(num_features, eps, momentum, affine, track_running_stats) for _ in range(num_splits - 1)]) + + def forward(self, input: torch.Tensor): + if self.training: # aux BN only relevant while training + split_size = input.shape[0] // self.num_splits + assert input.shape[0] == split_size * self.num_splits, "batch size must be evenly divisible by num_splits" + split_input = input.split(split_size) + x = [super().forward(split_input[0])] + for i, a in enumerate(self.aux_bn): + x.append(a(split_input[i + 1])) + return torch.cat(x, dim=0) + else: + return super().forward(input) + + +def convert_splitbn_model(module, num_splits=2): + """ + Recursively traverse module and its children to replace all instances of + ``torch.nn.modules.batchnorm._BatchNorm`` with `SplitBatchnorm2d`. 
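To make the split concrete, a short sketch of what SplitBatchNorm2d above does with a batch while training; the sizes are illustrative and the import path is assumed from this patch's layout.

import torch
from timm.models.quantization.layers.split_batchnorm import SplitBatchNorm2d  # path assumed

bn = SplitBatchNorm2d(32, num_splits=2).train()
x = torch.randn(16, 32, 8, 8)
# the first 8 samples go through the parent BN (the weights/stats that get exported),
# the remaining 8 go through the auxiliary BN in bn.aux_bn[0], tracked separately
y = bn(x)
print(y.shape)        # torch.Size([16, 32, 8, 8])
bn.eval()
print(bn(x).shape)    # in eval, only the parent BN and its running stats are used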
+ Args: + module (torch.nn.Module): input module + num_splits: number of separate batchnorm layers to split input across + Example:: + >>> # model is an instance of torch.nn.Module + >>> model = timm.models.convert_splitbn_model(model, num_splits=2) + """ + mod = module + if isinstance(module, torch.nn.modules.instancenorm._InstanceNorm): + return module + if isinstance(module, torch.nn.modules.batchnorm._BatchNorm): + mod = SplitBatchNorm2d( + module.num_features, module.eps, module.momentum, module.affine, + module.track_running_stats, num_splits=num_splits) + mod.running_mean = module.running_mean + mod.running_var = module.running_var + mod.num_batches_tracked = module.num_batches_tracked + if module.affine: + mod.weight.data = module.weight.data.clone().detach() + mod.bias.data = module.bias.data.clone().detach() + for aux in mod.aux_bn: + aux.running_mean = module.running_mean.clone() + aux.running_var = module.running_var.clone() + aux.num_batches_tracked = module.num_batches_tracked.clone() + if module.affine: + aux.weight.data = module.weight.data.clone().detach() + aux.bias.data = module.bias.data.clone().detach() + for name, child in module.named_children(): + mod.add_module(name, convert_splitbn_model(child, num_splits=num_splits)) + del module + return mod diff --git a/timm/models/quantization/layers/test_time_pool.py b/timm/models/quantization/layers/test_time_pool.py new file mode 100644 index 00000000..642f4e4b --- /dev/null +++ b/timm/models/quantization/layers/test_time_pool.py @@ -0,0 +1,50 @@ +""" Test Time Pooling (Average-Max Pool) + +Hacked together by / Copyright 2020 Ross Wightman +""" + +import logging +from torch import nn +import torch.nn.functional as F + +from .adaptive_avgmax_pool import adaptive_avgmax_pool2d + + +_logger = logging.getLogger(__name__) + + +class TestTimePoolHead(nn.Module): + def __init__(self, base, original_pool=7): + super(TestTimePoolHead, self).__init__() + self.base = base + self.original_pool = original_pool + base_fc = self.base.get_classifier() + if isinstance(base_fc, nn.Conv2d): + self.fc = base_fc + else: + self.fc = nn.Conv2d( + self.base.num_features, self.base.num_classes, kernel_size=1, bias=True) + self.fc.weight.data.copy_(base_fc.weight.data.view(self.fc.weight.size())) + self.fc.bias.data.copy_(base_fc.bias.data.view(self.fc.bias.size())) + self.base.reset_classifier(0) # delete original fc layer + + def forward(self, x): + x = self.base.forward_features(x) + x = F.avg_pool2d(x, kernel_size=self.original_pool, stride=1) + x = self.fc(x) + x = adaptive_avgmax_pool2d(x, 1) + return x.view(x.size(0), -1) + + +def apply_test_time_pool(model, config, args): + test_time_pool = False + if not hasattr(model, 'default_cfg') or not model.default_cfg: + return model, False + if not args.no_test_pool and \ + config['input_size'][-1] > model.default_cfg['input_size'][-1] and \ + config['input_size'][-2] > model.default_cfg['input_size'][-2]: + _logger.info('Target input size %s > pretrained default %s, using test time pooling' % + (str(config['input_size'][-2:]), str(model.default_cfg['input_size'][-2:]))) + model = TestTimePoolHead(model, original_pool=model.default_cfg['pool_size']) + test_time_pool = True + return model, test_time_pool diff --git a/timm/models/quantization/layers/weight_init.py b/timm/models/quantization/layers/weight_init.py new file mode 100644 index 00000000..d731029f --- /dev/null +++ b/timm/models/quantization/layers/weight_init.py @@ -0,0 +1,60 @@ +import torch +import math +import warnings + + +def 
_no_grad_trunc_normal_(tensor, mean, std, a, b): + # Cut & paste from PyTorch official master until it's in a few official releases - RW + # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf + def norm_cdf(x): + # Computes standard normal cumulative distribution function + return (1. + math.erf(x / math.sqrt(2.))) / 2. + + if (mean < a - 2 * std) or (mean > b + 2 * std): + warnings.warn("mean is more than 2 std from [a, b] in nn.init.trunc_normal_. " + "The distribution of values may be incorrect.", + stacklevel=2) + + with torch.no_grad(): + # Values are generated by using a truncated uniform distribution and + # then using the inverse CDF for the normal distribution. + # Get upper and lower cdf values + l = norm_cdf((a - mean) / std) + u = norm_cdf((b - mean) / std) + + # Uniformly fill tensor with values from [l, u], then translate to + # [2l-1, 2u-1]. + tensor.uniform_(2 * l - 1, 2 * u - 1) + + # Use inverse cdf transform for normal distribution to get truncated + # standard normal + tensor.erfinv_() + + # Transform to proper mean, std + tensor.mul_(std * math.sqrt(2.)) + tensor.add_(mean) + + # Clamp to ensure it's in the proper range + tensor.clamp_(min=a, max=b) + return tensor + + +def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.): + # type: (Tensor, float, float, float, float) -> Tensor + r"""Fills the input Tensor with values drawn from a truncated + normal distribution. The values are effectively drawn from the + normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)` + with values outside :math:`[a, b]` redrawn until they are within + the bounds. The method used for generating the random values works + best when :math:`a \leq \text{mean} \leq b`. + Args: + tensor: an n-dimensional `torch.Tensor` + mean: the mean of the normal distribution + std: the standard deviation of the normal distribution + a: the minimum cutoff value + b: the maximum cutoff value + Examples: + >>> w = torch.empty(3, 5) + >>> nn.init.trunc_normal_(w) + """ + return _no_grad_trunc_normal_(tensor, mean, std, a, b) diff --git a/timm/models/quantization/mobilenetv3.py b/timm/models/quantization/mobilenetv3.py new file mode 100644 index 00000000..fb0dda1e --- /dev/null +++ b/timm/models/quantization/mobilenetv3.py @@ -0,0 +1,462 @@ + +""" MobileNet V3 + +A PyTorch impl of MobileNet-V3, compatible with TF weights from official impl. 
+ +Paper: Searching for MobileNetV3 - https://arxiv.org/abs/1905.02244 + +Hacked together by / Copyright 2020 Ross Wightman +""" +import torch +import torch.nn as nn +import torch.nn.functional as F + +from typing import List + +from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD, IMAGENET_INCEPTION_MEAN, IMAGENET_INCEPTION_STD +from .efficientnet_blocks import round_channels, resolve_bn_args, resolve_act_layer, BN_EPS_TF_DEFAULT +from .efficientnet_builder import EfficientNetBuilder, decode_arch_def, efficientnet_init_weights +from timm.models.features import FeatureInfo, FeatureHooks +from timm.models.helpers import build_model_with_cfg +from .layers import SelectAdaptivePool2d, create_conv2d, HardSigmoid +from timm.models.registry import register_model +from .efficientnet_blocks import ConvBnAct, SqueezeExcite, DepthwiseSeparableConv, InvertedResidual +__all__ = ['MobileNetV3'] + + +def _cfg(url='', **kwargs): + return { + 'url': url, 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (1, 1), + 'crop_pct': 0.875, 'interpolation': 'bilinear', + 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, + 'first_conv': 'conv_stem', 'classifier': 'classifier', + **kwargs + } + + +default_cfgs = { + 'mobilenetv3_large_075': _cfg(url=''), + 'mobilenetv3_large_100': _cfg( + interpolation='bicubic', + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mobilenetv3_large_100_ra-f55367f5.pth'), + 'mobilenetv3_small_075': _cfg(url=''), + 'mobilenetv3_small_100': _cfg(url=''), + 'mobilenetv3_rw': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/mobilenetv3_100-35495452.pth', + interpolation='bicubic'), + 'tf_mobilenetv3_large_075': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_mobilenetv3_large_075-150ee8b0.pth', + mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD), + 'tf_mobilenetv3_large_100': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_mobilenetv3_large_100-427764d5.pth', + mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD), + 'tf_mobilenetv3_large_minimal_100': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_mobilenetv3_large_minimal_100-8596ae28.pth', + mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD), + 'tf_mobilenetv3_small_075': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_mobilenetv3_small_075-da427f52.pth', + mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD), + 'tf_mobilenetv3_small_100': _cfg( + url= 'https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_mobilenetv3_small_100-37f49e2b.pth', + mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD), + 'tf_mobilenetv3_small_minimal_100': _cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-weights/tf_mobilenetv3_small_minimal_100-922a7843.pth', + mean=IMAGENET_INCEPTION_MEAN, std=IMAGENET_INCEPTION_STD), +} + +_DEBUG = False + + +class MobileNetV3(nn.Module): + """ MobiletNet-V3 + + Based on my EfficientNet implementation and building blocks, this model utilizes the MobileNet-v3 specific + 'efficient head', where global pooling is done before the head convolution without a final batch-norm + layer before the classifier. 
+ + Paper: https://arxiv.org/abs/1905.02244 + """ + + def __init__(self, block_args, num_classes=1000, in_chans=3, stem_size=16, num_features=1280, head_bias=True, + channel_multiplier=1.0, pad_type='', act_layer=nn.ReLU, drop_rate=0., drop_path_rate=0., + se_kwargs=None, norm_layer=nn.BatchNorm2d, norm_kwargs=None, global_pool='avg'): + super(MobileNetV3, self).__init__() + + #temporary fix to support tf_models with torch quantization + pad_type='' + + self.num_classes = num_classes + self.num_features = num_features + self.drop_rate = drop_rate + + # Stem + stem_size = round_channels(stem_size, channel_multiplier) + self.conv_stem = create_conv2d(in_chans, stem_size, 3, stride=2, padding=pad_type) + self.bn1 = norm_layer(stem_size, **norm_kwargs) + self.act1 = act_layer(inplace=True) + + # Middle stages (IR/ER/DS Blocks) + builder = EfficientNetBuilder( + channel_multiplier, 8, None, 32, pad_type, act_layer, se_kwargs, + norm_layer, norm_kwargs, drop_path_rate, verbose=_DEBUG) + self.blocks = nn.Sequential(*builder(stem_size, block_args)) + self.feature_info = builder.features + head_chs = builder.in_chs + + # Head + Pooling + self.global_pool = SelectAdaptivePool2d(pool_type=global_pool) + num_pooled_chs = head_chs * self.global_pool.feat_mult() + self.conv_head = create_conv2d(num_pooled_chs, self.num_features, 1, padding=pad_type, bias=head_bias) + self.act2 = act_layer(inplace=True) + self.classifier = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity() + + # Quantization Stubs + self.quant = torch.quantization.QuantStub() + self.dequant = torch.quantization.DeQuantStub() + + efficientnet_init_weights(self) + + def as_sequential(self): + layers = [self.conv_stem, self.bn1, self.act1] + layers.extend(self.blocks) + layers.extend([self.global_pool, self.conv_head, self.act2]) + layers.extend([nn.Flatten(), nn.Dropout(self.drop_rate), self.classifier]) + return nn.Sequential(*layers) + + def get_classifier(self): + return self.classifier + + def reset_classifier(self, num_classes, global_pool='avg'): + self.num_classes = num_classes + # cannot meaningfully change pooling of efficient head after creation + self.global_pool = SelectAdaptivePool2d(pool_type=global_pool) + self.classifier = nn.Linear(self.num_features, num_classes) if num_classes > 0 else nn.Identity() + + def forward_features(self, x): + x = self.conv_stem(x) + x = self.bn1(x) + x = self.act1(x) + x = self.blocks(x) + x = self.global_pool(x) + x = self.conv_head(x) + x = self.act2(x) + return x + + def forward(self, x): + x = self.quant(x) + x = self.forward_features(x) + if not self.global_pool.is_identity(): + x = x.flatten(1) + if self.drop_rate > 0.: + x = F.dropout(x, p=self.drop_rate, training=self.training) + x = self.classifier(x) + x = self.dequant(x) + return x + + def fuse_model(self): + modules_to_fuse = [['conv_stem','bn1']] + if type(self.act1) == nn.ReLU: + modules_to_fuse[0].append('act1') + if type(self.act2) == nn.ReLU: + modules_to_fuse.append(['conv_head','act2']) + torch.quantization.fuse_modules(self, modules_to_fuse, inplace=True) + for m in self.modules(): + if type(m) in [ConvBnAct, SqueezeExcite, DepthwiseSeparableConv, InvertedResidual]: + m.fuse_module() + +class MobileNetV3Features(nn.Module): + """ MobileNetV3 Feature Extractor + + A work-in-progress feature extraction module for MobileNet-V3 to use as a backbone for segmentation + and object detection models. 
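The QuantStub/DeQuantStub pair and fuse_model above are what the eager-mode post-training static quantization flow hooks into. A minimal sketch of that flow for one of the quant models registered below; calib_loader is a stand-in for a small calibration dataset and is not part of this patch.

import torch
from timm.models import create_model

model = create_model('quant_mobilenetv3_large_100', pretrained=True)
model.eval()
model.fuse_model()                                                   # fold conv + bn (+ relu) so they quantize as one op
model.qconfig = torch.quantization.get_default_qconfig('fbgemm')     # x86 backend; 'qnnpack' for ARM
torch.quantization.prepare(model, inplace=True)                      # insert observers
with torch.no_grad():
    for images, _ in calib_loader:                                   # calib_loader: assumed calibration set
        model(images)                                                # collect activation ranges
torch.quantization.convert(model, inplace=True)                      # swap modules for their int8 versions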
+ """ + + def __init__(self, block_args, out_indices=(0, 1, 2, 3, 4), feature_location='bottleneck', + in_chans=3, stem_size=16, channel_multiplier=1.0, output_stride=32, pad_type='', + act_layer=nn.ReLU, drop_rate=0., drop_path_rate=0., se_kwargs=None, + norm_layer=nn.BatchNorm2d, norm_kwargs=None): + super(MobileNetV3Features, self).__init__() + norm_kwargs = norm_kwargs or {} + self.drop_rate = drop_rate + + # Stem + stem_size = round_channels(stem_size, channel_multiplier) + self.conv_stem = create_conv2d(in_chans, stem_size, 3, stride=2, padding=pad_type) + self.bn1 = norm_layer(stem_size, **norm_kwargs) + self.act1 = act_layer(inplace=True) + + # Middle stages (IR/ER/DS Blocks) + builder = EfficientNetBuilder( + channel_multiplier, 8, None, output_stride, pad_type, act_layer, se_kwargs, + norm_layer, norm_kwargs, drop_path_rate, feature_location=feature_location, verbose=_DEBUG) + self.blocks = nn.Sequential(*builder(stem_size, block_args)) + self.feature_info = FeatureInfo(builder.features, out_indices) + self._stage_out_idx = {v['stage']: i for i, v in enumerate(self.feature_info) if i in out_indices} + + efficientnet_init_weights(self) + + # Register feature extraction hooks with FeatureHooks helper + self.feature_hooks = None + if feature_location != 'bottleneck': + hooks = self.feature_info.get_dicts(keys=('module', 'hook_type')) + self.feature_hooks = FeatureHooks(hooks, self.named_modules()) + + def forward(self, x) -> List[torch.Tensor]: + x = self.conv_stem(x) + x = self.bn1(x) + x = self.act1(x) + if self.feature_hooks is None: + features = [] + if 0 in self._stage_out_idx: + features.append(x) # add stem out + for i, b in enumerate(self.blocks): + x = b(x) + if i + 1 in self._stage_out_idx: + features.append(x) + return features + else: + self.blocks(x) + out = self.feature_hooks.get_output(x.device) + return list(out.values()) + + +def _create_mnv3(model_kwargs, variant, pretrained=False): + if model_kwargs.pop('features_only', False): + load_strict = False + model_kwargs.pop('num_classes', 0) + model_kwargs.pop('num_features', 0) + model_kwargs.pop('head_conv', None) + model_kwargs.pop('head_bias', None) + model_cls = MobileNetV3Features + else: + load_strict = True + model_cls = MobileNetV3 + return build_model_with_cfg( + model_cls, variant, pretrained, default_cfg=default_cfgs[variant], + pretrained_strict=load_strict, **model_kwargs) + + +def _gen_mobilenet_v3_rw(variant, channel_multiplier=1.0, pretrained=False, **kwargs): + """Creates a MobileNet-V3 model. + + Ref impl: ? + Paper: https://arxiv.org/abs/1905.02244 + + Args: + channel_multiplier: multiplier to number of channels per layer. 
+ """ + arch_def = [ + # stage 0, 112x112 in + ['ds_r1_k3_s1_e1_c16_nre_noskip'], # relu + # stage 1, 112x112 in + ['ir_r1_k3_s2_e4_c24_nre', 'ir_r1_k3_s1_e3_c24_nre'], # relu + # stage 2, 56x56 in + ['ir_r3_k5_s2_e3_c40_se0.25_nre'], # relu + # stage 3, 28x28 in + ['ir_r1_k3_s2_e6_c80', 'ir_r1_k3_s1_e2.5_c80', 'ir_r2_k3_s1_e2.3_c80'], # hard-swish + # stage 4, 14x14in + ['ir_r2_k3_s1_e6_c112_se0.25'], # hard-swish + # stage 5, 14x14in + ['ir_r3_k5_s2_e6_c160_se0.25'], # hard-swish + # stage 6, 7x7 in + ['cn_r1_k1_s1_c960'], # hard-swish + ] + model_kwargs = dict( + block_args=decode_arch_def(arch_def), + head_bias=False, + channel_multiplier=channel_multiplier, + norm_kwargs=resolve_bn_args(kwargs), + act_layer=resolve_act_layer(kwargs, 'hard_swish'), + se_kwargs=dict(gate_fn=tf_mobilenetv3_large_100, reduce_mid=True, divisor=1), + **kwargs, + ) + model = _create_mnv3(model_kwargs, variant, pretrained) + return model + + +def _gen_mobilenet_v3(variant, channel_multiplier=1.0, pretrained=False, **kwargs): + """Creates a MobileNet-V3 model. + + Ref impl: ? + Paper: https://arxiv.org/abs/1905.02244 + + Args: + channel_multiplier: multiplier to number of channels per layer. + """ + if 'small' in variant: + num_features = 1024 + if 'minimal' in variant: + act_layer = resolve_act_layer(kwargs, 'relu') + arch_def = [ + # stage 0, 112x112 in + ['ds_r1_k3_s2_e1_c16'], + # stage 1, 56x56 in + ['ir_r1_k3_s2_e4.5_c24', 'ir_r1_k3_s1_e3.67_c24'], + # stage 2, 28x28 in + ['ir_r1_k3_s2_e4_c40', 'ir_r2_k3_s1_e6_c40'], + # stage 3, 14x14 in + ['ir_r2_k3_s1_e3_c48'], + # stage 4, 14x14in + ['ir_r3_k3_s2_e6_c96'], + # stage 6, 7x7 in + ['cn_r1_k1_s1_c576'], + ] + else: + act_layer = resolve_act_layer(kwargs, 'hard_swish') + arch_def = [ + # stage 0, 112x112 in + ['ds_r1_k3_s2_e1_c16_se0.25_nre'], # relu + # stage 1, 56x56 in + ['ir_r1_k3_s2_e4.5_c24_nre', 'ir_r1_k3_s1_e3.67_c24_nre'], # relu + # stage 2, 28x28 in + ['ir_r1_k5_s2_e4_c40_se0.25', 'ir_r2_k5_s1_e6_c40_se0.25'], # hard-swish + # stage 3, 14x14 in + ['ir_r2_k5_s1_e3_c48_se0.25'], # hard-swish + # stage 4, 14x14in + ['ir_r3_k5_s2_e6_c96_se0.25'], # hard-swish + # stage 6, 7x7 in + ['cn_r1_k1_s1_c576'], # hard-swish + ] + else: + num_features = 1280 + if 'minimal' in variant: + act_layer = resolve_act_layer(kwargs, 'relu') + arch_def = [ + # stage 0, 112x112 in + ['ds_r1_k3_s1_e1_c16'], + # stage 1, 112x112 in + ['ir_r1_k3_s2_e4_c24', 'ir_r1_k3_s1_e3_c24'], + # stage 2, 56x56 in + ['ir_r3_k3_s2_e3_c40'], + # stage 3, 28x28 in + ['ir_r1_k3_s2_e6_c80', 'ir_r1_k3_s1_e2.5_c80', 'ir_r2_k3_s1_e2.3_c80'], + # stage 4, 14x14in + ['ir_r2_k3_s1_e6_c112'], + # stage 5, 14x14in + ['ir_r3_k3_s2_e6_c160'], + # stage 6, 7x7 in + ['cn_r1_k1_s1_c960'], + ] + else: + act_layer = resolve_act_layer(kwargs, 'hard_swish') + arch_def = [ + # stage 0, 112x112 in + ['ds_r1_k3_s1_e1_c16_nre'], # relu + # stage 1, 112x112 in + ['ir_r1_k3_s2_e4_c24_nre', 'ir_r1_k3_s1_e3_c24_nre'], # relu + # stage 2, 56x56 in + ['ir_r3_k5_s2_e3_c40_se0.25_nre'], # relu + # stage 3, 28x28 in + ['ir_r1_k3_s2_e6_c80', 'ir_r1_k3_s1_e2.5_c80', 'ir_r2_k3_s1_e2.3_c80'], # hard-swish + # stage 4, 14x14in + ['ir_r2_k3_s1_e6_c112_se0.25'], # hard-swish + # stage 5, 14x14in + ['ir_r3_k5_s2_e6_c160_se0.25'], # hard-swish + # stage 6, 7x7 in + ['cn_r1_k1_s1_c960'], # hard-swish + ] + + model_kwargs = dict( + block_args=decode_arch_def(arch_def), + num_features=num_features, + stem_size=16, + channel_multiplier=channel_multiplier, + norm_kwargs=resolve_bn_args(kwargs), + act_layer=act_layer, + 
se_kwargs=dict(act_layer=nn.ReLU, gate_fn=HardSigmoid, reduce_mid=True, divisor=8), + **kwargs, + ) + model = _create_mnv3(model_kwargs, variant, pretrained) + return model + + +@register_model +def quant_mobilenetv3_large_075(pretrained=False, **kwargs): + """ MobileNet V3 """ + model = _gen_mobilenet_v3('mobilenetv3_large_075', 0.75, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_mobilenetv3_large_100(pretrained=False, **kwargs): + """ MobileNet V3 """ + model = _gen_mobilenet_v3('mobilenetv3_large_100', 1.0, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_mobilenetv3_small_075(pretrained=False, **kwargs): + """ MobileNet V3 """ + model = _gen_mobilenet_v3('mobilenetv3_small_075', 0.75, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_mobilenetv3_small_100(pretrained=False, **kwargs): + """ MobileNet V3 """ + model = _gen_mobilenet_v3('mobilenetv3_small_100', 1.0, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_mobilenetv3_rw(pretrained=False, **kwargs): + """ MobileNet V3 """ + if pretrained: + # pretrained model trained with non-default BN epsilon + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + model = _gen_mobilenet_v3_rw('mobilenetv3_rw', 1.0, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_tf_mobilenetv3_large_075(pretrained=False, **kwargs): + """ MobileNet V3 """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_mobilenet_v3('tf_mobilenetv3_large_075', 0.75, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_tf_mobilenetv3_large_100(pretrained=False, **kwargs): + """ MobileNet V3 """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_mobilenet_v3('tf_mobilenetv3_large_100', 1.0, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_tf_mobilenetv3_large_minimal_100(pretrained=False, **kwargs): + """ MobileNet V3 """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_mobilenet_v3('tf_mobilenetv3_large_minimal_100', 1.0, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_tf_mobilenetv3_small_075(pretrained=False, **kwargs): + """ MobileNet V3 """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_mobilenet_v3('tf_mobilenetv3_small_075', 0.75, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_tf_mobilenetv3_small_100(pretrained=False, **kwargs): + """ MobileNet V3 """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_mobilenet_v3('tf_mobilenetv3_small_100', 1.0, pretrained=pretrained, **kwargs) + return model + + +@register_model +def quant_tf_mobilenetv3_small_minimal_100(pretrained=False, **kwargs): + """ MobileNet V3 """ + kwargs['bn_eps'] = BN_EPS_TF_DEFAULT + kwargs['pad_type'] = 'same' + model = _gen_mobilenet_v3('tf_mobilenetv3_small_minimal_100', 1.0, pretrained=pretrained, **kwargs) + return model diff --git a/timm/models/quantization/rexnet.py b/timm/models/quantization/rexnet.py new file mode 100644 index 00000000..1c5be131 --- /dev/null +++ b/timm/models/quantization/rexnet.py @@ -0,0 +1,323 @@ +""" ReXNet + +A PyTorch impl of `ReXNet: Diminishing Representational Bottleneck on Convolutional Neural Network` - +https://arxiv.org/abs/2007.00992 + +Adapted from original impl at https://github.com/clovaai/rexnet +Copyright (c) 2020-present NAVER Corp. 
MIT license + +Changes for timm, feature extraction, and rounded channel variant hacked together by Ross Wightman +Copyright 2020 Ross Wightman +""" +import torch +import torch.nn as nn +from math import ceil + +from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD +from timm.models.helpers import build_model_with_cfg +from .layers import ClassifierHead, create_act_layer, create_conv2d +from timm.models.registry import register_model +from .layers.activations import sigmoid, Swish, HardSwish, HardSigmoid + +def _cfg(url=''): + return { + 'url': url, 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7), + 'crop_pct': 0.875, 'interpolation': 'bicubic', + 'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD, + 'first_conv': 'stem.conv', 'classifier': 'head.fc', + } + + +default_cfgs = dict( + rexnet_100=_cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rexnet/rexnetv1_100-1b4dddf4.pth'), + rexnet_130=_cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rexnet/rexnetv1_130-590d768e.pth'), + rexnet_150=_cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rexnet/rexnetv1_150-bd1a6aa8.pth'), + rexnet_200=_cfg( + url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-rexnet/rexnetv1_200-8c0b7f2d.pth'), + rexnetr_100=_cfg( + url=''), + rexnetr_130=_cfg( + url=''), + rexnetr_150=_cfg( + url=''), + rexnetr_200=_cfg( + url=''), +) + + +def make_divisible(v, divisor=8, min_value=None): + min_value = min_value or divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + return new_v + +class ConvBn(nn.Module): + def __init__(self, in_chs, out_chs, kernel_size, + stride=1, dilation=1, pad_type='', + norm_layer=nn.BatchNorm2d, groups = 1,norm_kwargs=None): + super(ConvBn, self).__init__() + norm_kwargs = norm_kwargs or {} + self.conv = create_conv2d(in_chs, out_chs, kernel_size, stride=stride, dilation=dilation, groups=groups,padding=pad_type) + self.bn1 = norm_layer(out_chs, **norm_kwargs) + + def feature_info(self, location): + if location == 'expansion': # output of conv after act, same as block coutput + info = dict(module='act1', hook_type='forward', num_chs=self.conv.out_channels) + else: # location == 'bottleneck', block output + info = dict(module='', hook_type='', num_chs=self.conv.out_channels) + return info + + def forward(self, x): + x = self.conv(x) + x = self.bn1(x) + return x + + def fuse_module(self): + modules_to_fuse = ['conv','bn1'] + torch.quantization.fuse_modules(self, modules_to_fuse, inplace=True) + +class ConvBnAct(nn.Module): + def __init__(self, in_chs, out_chs, kernel_size, + stride=1, dilation=1, pad_type='', act_layer=nn.ReLU, + norm_layer=nn.BatchNorm2d, norm_kwargs=None): + super(ConvBnAct, self).__init__() + norm_kwargs = norm_kwargs or {} + self.conv = create_conv2d(in_chs, out_chs, kernel_size, stride=stride, dilation=dilation, padding=pad_type) + self.bn1 = norm_layer(out_chs, **norm_kwargs) + self.act1 = act_layer(inplace=True) + self.out_channels = out_chs + def feature_info(self, location): + if location == 'expansion': # output of conv after act, same as block coutput + info = dict(module='act1', hook_type='forward', num_chs=self.conv.out_channels) + else: # location == 'bottleneck', block output + info = dict(module='', hook_type='', num_chs=self.conv.out_channels) + return info + + def forward(self, x): + x = self.conv(x) + x = self.bn1(x) + x = self.act1(x) + return x + def 
fuse_module(self): + modules_to_fuse = ['conv','bn1'] + if type(self.act1) == nn.ReLU: + modules_to_fuse.append('act1') + torch.quantization.fuse_modules(self, modules_to_fuse, inplace=True) + +class SEWithNorm(nn.Module): + + def __init__(self, channels, reduction=16, act_layer=nn.ReLU, divisor=1, reduction_channels=None, + gate_layer='sigmoid'): + super(SEWithNorm, self).__init__() + self.avg_pool = nn.AdaptiveAvgPool2d(1) + reduction_channels = reduction_channels or make_divisible(channels // reduction, divisor=divisor) + self.fc1 = nn.Conv2d( + channels, reduction_channels, kernel_size=1, padding=0, bias=True) + self.bn = nn.BatchNorm2d(reduction_channels) + self.act = act_layer(inplace=True) + self.fc2 = nn.Conv2d( + reduction_channels, channels, kernel_size=1, padding=0, bias=True) + self.gate = create_act_layer(gate_layer) + self.quant_mul = nn.quantized.FloatFunctional() + def forward(self, x): + x_se = self.avg_pool(x) + x_se = self.fc1(x_se) + x_se = self.bn(x_se) + x_se = self.act(x_se) + x_se = self.fc2(x_se) + return self.quant_mul.mul(x, self.gate(x_se)) + + def fuse_module(self): + modules_to_fuse = ['fc1','bn','act'] + torch.quantization.fuse_modules(self, modules_to_fuse, inplace=True) + + +class LinearBottleneck(nn.Module): + def __init__(self, in_chs, out_chs, stride, exp_ratio=1.0, use_se=True, se_rd=12, ch_div=1): + super(LinearBottleneck, self).__init__() + self.use_shortcut = stride == 1 and in_chs <= out_chs + self.in_channels = in_chs + self.out_channels = out_chs + + if exp_ratio != 1.: + dw_chs = make_divisible(round(in_chs * exp_ratio), divisor=ch_div) + self.conv_exp = ConvBnAct(in_chs, dw_chs,1, act_layer=Swish) + else: + dw_chs = in_chs + self.conv_exp = None + + self.conv_dw = ConvBn(dw_chs, dw_chs, 3, stride=stride, groups=dw_chs) + self.se = SEWithNorm(dw_chs, reduction=se_rd, divisor=ch_div) if use_se else None + self.act_dw = nn.ReLU6() + + self.conv_pwl = ConvBn(dw_chs, out_chs, 1) + if self.use_shortcut: + self.skip_add = nn.quantized.FloatFunctional() + + def feat_channels(self, exp=False): + return self.conv_dw.out_channels if exp else self.out_channels + + def forward(self, x): + shortcut = x + if self.conv_exp is not None: + x = self.conv_exp(x) + x = self.conv_dw(x) + if self.se is not None: + x = self.se(x) + x = self.act_dw(x) + x = self.conv_pwl(x) + if self.use_shortcut: + x[:, 0:self.in_channels]= self.skip_add.add(x[:, 0:self.in_channels], shortcut) + return x + +def _block_cfg(width_mult=1.0, depth_mult=1.0, initial_chs=16, final_chs=180, use_se=True, ch_div=1): + layers = [1, 2, 2, 3, 3, 5] + strides = [1, 2, 2, 2, 1, 2] + layers = [ceil(element * depth_mult) for element in layers] + strides = sum([[element] + [1] * (layers[idx] - 1) for idx, element in enumerate(strides)], []) + exp_ratios = [1] * layers[0] + [6] * sum(layers[1:]) + depth = sum(layers[:]) * 3 + base_chs = initial_chs / width_mult if width_mult < 1.0 else initial_chs + + # The following channel configuration is a simple instance to make each layer become an expand layer. 
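    # (worked illustration, assuming the defaults above: width_mult=1.0, depth_mult=1.0, initial_chs=16, final_chs=180)
    # layers -> [1, 2, 2, 3, 3, 5], so depth // 3 = 16 blocks and base_chs grows by 180 / 16 = 11.25 per block;
    # out_chs_list therefore climbs roughly linearly from 16 toward ~185, one small expansion per block.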
+ out_chs_list = [] + for i in range(depth // 3): + out_chs_list.append(make_divisible(round(base_chs * width_mult), divisor=ch_div)) + base_chs += final_chs / (depth // 3 * 1.0) + + if use_se: + use_ses = [False] * (layers[0] + layers[1]) + [True] * sum(layers[2:]) + else: + use_ses = [False] * sum(layers[:]) + + return zip(out_chs_list, exp_ratios, strides, use_ses) + + +def _build_blocks(block_cfg, prev_chs, width_mult, se_rd=12, ch_div=1, feature_location='bottleneck'): + feat_exp = feature_location == 'expansion' + feat_chs = [prev_chs] + feature_info = [] + curr_stride = 2 + features = [] + for block_idx, (chs, exp_ratio, stride, se) in enumerate(block_cfg): + if stride > 1: + fname = 'stem' if block_idx == 0 else f'features.{block_idx - 1}' + if block_idx > 0 and feat_exp: + fname += '.act_dw' + feature_info += [dict(num_chs=feat_chs[-1], reduction=curr_stride, module=fname)] + curr_stride *= stride + features.append(LinearBottleneck( + in_chs=prev_chs, out_chs=chs, exp_ratio=exp_ratio, stride=stride, use_se=se, se_rd=se_rd, ch_div=ch_div)) + prev_chs = chs + feat_chs += [features[-1].feat_channels(feat_exp)] + pen_chs = make_divisible(1280 * width_mult, divisor=ch_div) + feature_info += [dict( + num_chs=pen_chs if feat_exp else feat_chs[-1], reduction=curr_stride, + module=f'features.{len(features) - int(not feat_exp)}')] + features.append(ConvBnAct(prev_chs, pen_chs,1, act_layer=Swish)) + return features, feature_info + + +class ReXNetV1(nn.Module): + def __init__(self, in_chans=3, num_classes=1000, global_pool='avg', output_stride=32, + initial_chs=16, final_chs=180, width_mult=1.0, depth_mult=1.0, use_se=True, + se_rd=12, ch_div=1, drop_rate=0.2, feature_location='bottleneck'): + super(ReXNetV1, self).__init__() + self.drop_rate = drop_rate + + assert output_stride == 32 # FIXME support dilation + stem_base_chs = 32 / width_mult if width_mult < 1.0 else 32 + stem_chs = make_divisible(round(stem_base_chs * width_mult), divisor=ch_div) + self.stem = ConvBnAct(in_chans, stem_chs, 3, stride=2, act_layer=Swish) + + block_cfg = _block_cfg(width_mult, depth_mult, initial_chs, final_chs, use_se, ch_div) + features, self.feature_info = _build_blocks( + block_cfg, stem_chs, width_mult, se_rd, ch_div, feature_location) + self.num_features = features[-1].out_channels + self.features = nn.Sequential(*features) + + self.head = ClassifierHead(self.num_features, num_classes, global_pool, drop_rate) + + # Quantization Stubs + self.quant = torch.quantization.QuantStub() + self.dequant = torch.quantization.DeQuantStub() + # FIXME weight init, the original appears to use PyTorch defaults + + def get_classifier(self): + return self.head.fc + + def reset_classifier(self, num_classes, global_pool='avg'): + self.head = ClassifierHead(self.num_features, num_classes, pool_type=global_pool, drop_rate=self.drop_rate) + + def forward_features(self, x): + x = self.stem(x) + x = self.features(x) + return x + + def forward(self, x): + x = self.quant(x) + x = self.forward_features(x) + x = self.head(x) + x = self.dequant(x) + return x + + def fuse_model(self): + for m in self.modules(): + if type(m) in [ConvBnAct, ConvBn, SEWithNorm]: + m.fuse_module() +def _create_rexnet(variant, pretrained, **kwargs): + feature_cfg = dict(flatten_sequential=True) + if kwargs.get('feature_location', '') == 'expansion': + feature_cfg['feature_cls'] = 'hook' + return build_model_with_cfg( + ReXNetV1, variant, pretrained, default_cfg=default_cfgs[variant], feature_cfg=feature_cfg, **kwargs) + + +@register_model +def 
quant_rexnet_100(pretrained=False, **kwargs): + """ReXNet V1 1.0x""" + return _create_rexnet('rexnet_100', pretrained, **kwargs) + + +@register_model +def quant_rexnet_130(pretrained=False, **kwargs): + """ReXNet V1 1.3x""" + return _create_rexnet('rexnet_130', pretrained, width_mult=1.3, **kwargs) + + +@register_model +def quant_rexnet_150(pretrained=False, **kwargs): + """ReXNet V1 1.5x""" + return _create_rexnet('rexnet_150', pretrained, width_mult=1.5, **kwargs) + + +@register_model +def quant_rexnet_200(pretrained=False, **kwargs): + """ReXNet V1 2.0x""" + return _create_rexnet('rexnet_200', pretrained, width_mult=2.0, **kwargs) + + +@register_model +def quant_rexnetr_100(pretrained=False, **kwargs): + """ReXNet V1 1.0x w/ rounded (mod 8) channels""" + return _create_rexnet('rexnetr_100', pretrained, ch_div=8, **kwargs) + + +@register_model +def quant_rexnetr_130(pretrained=False, **kwargs): + """ReXNet V1 1.3x w/ rounded (mod 8) channels""" + return _create_rexnet('rexnetr_130', pretrained, width_mult=1.3, ch_div=8, **kwargs) + + +@register_model +def quant_rexnetr_150(pretrained=False, **kwargs): + """ReXNet V1 1.5x w/ rounded (mod 8) channels""" + return _create_rexnet('rexnetr_150', pretrained, width_mult=1.5, ch_div=8, **kwargs) + + +@register_model +def quant_rexnetr_200(pretrained=False, **kwargs): + """ReXNet V1 2.0x w/ rounded (mod 8) channels""" + return _create_rexnet('rexnetr_200', pretrained, width_mult=2.0, ch_div=8, **kwargs) From d1b2136669dd22d991c8e0ee4fd72b53c358e585 Mon Sep 17 00:00:00 2001 From: datamining108 Date: Mon, 21 Sep 2020 20:55:21 +0900 Subject: [PATCH 2/4] remove unecessary imports --- timm/models/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/timm/models/__init__.py b/timm/models/__init__.py index 0f747535..2b16f7b8 100644 --- a/timm/models/__init__.py +++ b/timm/models/__init__.py @@ -24,7 +24,6 @@ from .tresnet import * from .vovnet import * from .xception import * from .xception_aligned import * -from .frostnet import * from .quantization.efficientnet import * from .quantization.mobilenetv3 import * from .quantization.rexnet import * From 2b2e9c77fd5cbfe727a5b4c33681391e6a2100b2 Mon Sep 17 00:00:00 2001 From: "taehoon.kim" Date: Mon, 22 Mar 2021 16:05:27 +0900 Subject: [PATCH 3/4] post quantization using graph_fx(beta) --- ...dation.py => post_quantization_validate.py | 304 +++++++++++------- 1 file changed, 186 insertions(+), 118 deletions(-) rename post_quantization_validation.py => post_quantization_validate.py (57%) mode change 100644 => 100755 diff --git a/post_quantization_validation.py b/post_quantization_validate.py old mode 100644 new mode 100755 similarity index 57% rename from post_quantization_validation.py rename to post_quantization_validate.py index 6ad3ce2f..73c09dd4 --- a/post_quantization_validation.py +++ b/post_quantization_validate.py @@ -1,4 +1,4 @@ -#!/usr/bin/env python +#!/usr/bin/env python3 """ ImageNet Validation Script This is intended to be a lean and easily modifiable ImageNet validation script for evaluating pretrained @@ -17,18 +17,29 @@ import torch import torch.nn as nn import torch.nn.parallel from collections import OrderedDict -import torch.quantization +from contextlib import suppress -try: - from apex import amp - has_apex = True -except ImportError: - has_apex = False +import torch.quantization +#currently, quantization only runs on CPUs +os.environ['CUDA_VISIBLE_DEVICES'] = "" from timm.models import create_model, apply_test_time_pool, load_checkpoint, is_model, list_models -from timm.data 
import Dataset, DatasetTar, resolve_data_config, RealLabelsImagenet -from timm.data.quant_loader import create_loader -from timm.utils import accuracy, AverageMeter, natural_key, setup_default_logging +from timm.data import create_dataset, create_loader, resolve_data_config, RealLabelsImagenet +from timm.utils import accuracy, AverageMeter, natural_key, setup_default_logging, set_jit_legacy + +#has_apex = False +#try: +# from apex import amp +# has_apex = True +#except ImportError: +# pass + +#as_native_amp = False +#try: +# if getattr(torch.cuda.amp, 'autocast') is not None: +# has_native_amp = True +#except AttributeError: +# pass torch.backends.cudnn.benchmark = True _logger = logging.getLogger('validate') @@ -37,18 +48,26 @@ _logger = logging.getLogger('validate') parser = argparse.ArgumentParser(description='PyTorch ImageNet Validation') parser.add_argument('data', metavar='DIR', help='path to dataset') +parser.add_argument('--dataset', '-d', metavar='NAME', default='', + help='dataset type (default: ImageFolder/ImageTar if empty)') +#argument for calibration dataset parser.add_argument('--calib-data', metavar='DIR', - help='path to calibration dataset') -parser.add_argument('--model', '-m', metavar='MODEL', default='dpn92', + help='path to calibration dataset') +# quantization option(weight only, dynamic, static) +parser.add_argument('--quant_option', metavar='NAME', default='static', + help='quantization option (weight_only, dynamic, static) (default: static)') +parser.add_argument('--split', metavar='NAME', default='validation', + help='dataset split (default: validation)') +parser.add_argument('--model', '-m', metavar='NAME', default='dpn92', help='model architecture (default: dpn92)') parser.add_argument('-j', '--workers', default=4, type=int, metavar='N', help='number of data loading workers (default: 2)') parser.add_argument('-b', '--batch-size', default=256, type=int, metavar='N', help='mini-batch size (default: 256)') -parser.add_argument('--calib-iter', default=100, type=int, - metavar='N', help='Train set iterations for calibration before quantization') parser.add_argument('--img-size', default=None, type=int, metavar='N', help='Input image dimension, uses model default if empty') +parser.add_argument('--input-size', default=None, nargs=3, type=int, + metavar='N N N', help='Input all image dimensions (d h w, e.g. --input-size 3 224 224), uses model default if empty') parser.add_argument('--crop-pct', default=None, type=float, metavar='N', help='Input image center crop pct') parser.add_argument('--mean', type=float, nargs='+', default=None, metavar='MEAN', @@ -57,26 +76,37 @@ parser.add_argument('--std', type=float, nargs='+', default=None, metavar='STD' help='Override std deviation of of dataset') parser.add_argument('--interpolation', default='', type=str, metavar='NAME', help='Image resize interpolation type (overrides model)') -parser.add_argument('--num-classes', type=int, default=1000, +parser.add_argument('--num-classes', type=int, default=None, help='Number classes in dataset') parser.add_argument('--class-map', default='', type=str, metavar='FILENAME', help='path to class to idx mapping file (default: "")') +parser.add_argument('--gp', default=None, type=str, metavar='POOL', + help='Global pool type, one of (fast, avg, max, avgmax, avgmaxc). 
Model default if None.') parser.add_argument('--log-freq', default=10, type=int, metavar='N', help='batch logging frequency (default: 10)') parser.add_argument('--checkpoint', default='', type=str, metavar='PATH', help='path to latest checkpoint (default: none)') parser.add_argument('--pretrained', dest='pretrained', action='store_true', help='use pre-trained model') -parser.add_argument('--num-gpu', type=int, default=1, - help='Number of GPUS to use') +#parser.add_argument('--num-gpu', type=int, default=1, +# help='Number of GPUS to use') +#num-gpu is set to zero(no gpu usage) +parser.add_argument('--num-gpu', type=int, default=0, + help='Number of GPUS to use') parser.add_argument('--no-test-pool', dest='no_test_pool', action='store_true', help='disable test time pool') parser.add_argument('--no-prefetcher', action='store_true', default=False, help='disable fast prefetcher') parser.add_argument('--pin-mem', action='store_true', default=False, help='Pin CPU memory in DataLoader for more efficient (sometimes) transfer to GPU.') -parser.add_argument('--amp', action='store_true', default=False, - help='Use AMP mixed precision') +parser.add_argument('--channels-last', action='store_true', default=False, + help='Use channels_last memory layout') +#parser.add_argument('--amp', action='store_true', default=False, +# help='Use AMP mixed precision. Defaults to Apex, fallback to native Torch AMP.') +#parser.add_argument('--apex-amp', action='store_true', default=False, +# help='Use NVIDIA Apex AMP mixed precision') +#parser.add_argument('--native-amp', action='store_true', default=False, +# help='Use Native Torch AMP mixed precision') parser.add_argument('--tf-preprocessing', action='store_true', default=False, help='Use Tensorflow preprocessing pipeline (require CPU TF installed') parser.add_argument('--use-ema', dest='use_ema', action='store_true', @@ -93,24 +123,28 @@ parser.add_argument('--valid-labels', default='', type=str, metavar='FILENAME', help='Valid label indices txt file for validation of partial label space') -def set_jit_legacy(): - """ Set JIT executor to legacy w/ support for op fusion - This is hopefully a temporary need in 1.5/1.5.1/1.6 to restore performance due to changes - in the JIT exectutor. These API are not supported so could change. - """ - # - assert hasattr(torch._C, '_jit_set_profiling_executor'), "Old JIT behavior doesn't exist!" - torch._C._jit_set_profiling_executor(False) - torch._C._jit_set_profiling_mode(False) - torch._C._jit_override_can_fuse_on_gpu(True) - #torch._C._jit_set_texpr_fuser_enabled(True) - - def validate(args): # might as well try to validate something args.pretrained = args.pretrained or not args.checkpoint args.prefetcher = not args.no_prefetcher +# amp_autocast = suppress # do nothing +# if args.amp: +# if has_native_amp: +# args.native_amp = True +# elif has_apex: +# args.apex_amp = True +# else: +# _logger.warning("Neither APEX or Native Torch AMP is available.") +# assert not args.apex_amp or not args.native_amp, "Only one AMP mode should be set." +# if args.native_amp: +# amp_autocast = torch.cuda.amp.autocast +# _logger.info('Validating in mixed precision with native PyTorch AMP.') +# elif args.apex_amp: +# _logger.info('Validating in mixed precision with NVIDIA APEX AMP.') +# else: +# _logger.info('Validating in float32. 
AMP not enabled.') + if args.legacy_jit: set_jit_legacy() @@ -120,30 +154,50 @@ def validate(args): pretrained=args.pretrained, num_classes=args.num_classes, in_chans=3, + global_pool=args.gp, scriptable=args.torchscript) + if args.num_classes is None: + assert hasattr(model, 'num_classes'), 'Model must have `num_classes` attr if not set on cmd line/config.' + args.num_classes = model.num_classes if args.checkpoint: - load_checkpoint(model, args.checkpoint, args.use_ema) + load_checkpoint(model, args.checkpoint, args.use_ema) param_count = sum([m.numel() for m in model.parameters()]) _logger.info('Model %s created, param count: %d' % (args.model, param_count)) - data_config = resolve_data_config(vars(args), model=model) - model, test_time_pool = apply_test_time_pool(model, data_config, args) + data_config = resolve_data_config(vars(args), model=model, use_test_size=True) + test_time_pool = False + if not args.no_test_pool: + model, test_time_pool = apply_test_time_pool(model, data_config, use_test_size=True) if args.torchscript: torch.jit.optimized_execution(True) model = torch.jit.script(model) +# model = model.cuda() +# if args.apex_amp: +# model = amp.initialize(model, opt_level='O1') + + if args.channels_last: + model = model.to(memory_format=torch.channels_last) +# if args.num_gpu > 1: +# model = torch.nn.DataParallel(model, device_ids=list(range(args.num_gpu))) + +# criterion = nn.CrossEntropyLoss().cuda() criterion = nn.CrossEntropyLoss() + dataset = create_dataset( + root=args.data, name=args.dataset, split=args.split, + load_bytes=args.tf_preprocessing, class_map=args.class_map) + + # added for post quantization calibration + + calib_dataset = create_dataset( + root=args.data, name=args.dataset, split=args.split, + load_bytes=args.tf_preprocessing, class_map=args.class_map) + - if os.path.splitext(args.data)[1] == '.tar' and os.path.isfile(args.data): - dataset = DatasetTar(args.data, load_bytes=args.tf_preprocessing, class_map=args.class_map) - calib_dataset = DatasetTar(args.calib_data, load_bytes=args.tf_preprocessing, class_map=args.class_map) - else: - dataset = Dataset(args.data, load_bytes=args.tf_preprocessing, class_map=args.class_map) - calib_dataset = Dataset(args.calib_data, load_bytes=args.tf_preprocessing, class_map=args.class_map) if args.valid_labels: with open(args.valid_labels, 'r') as f: valid_labels = {int(line.rstrip()) for line in f} @@ -169,6 +223,8 @@ def validate(args): crop_pct=crop_pct, pin_memory=args.pin_mem, tf_preprocessing=args.tf_preprocessing) + + #Also create loader for calibration dataset calib_loader = create_loader( calib_dataset, input_size=data_config['input_size'], @@ -180,94 +236,104 @@ def validate(args): num_workers=args.workers, crop_pct=crop_pct, pin_memory=args.pin_mem, - tf_preprocessing=args.tf_preprocessing) + tf_preprocessing=args.tf_preprocessing) + batch_time = AverageMeter() losses = AverageMeter() top1 = AverageMeter() top5 = AverageMeter() - print('Start calibration of quantization observers before post-quantization') - model.eval() - model.fuse_model() - model.qconfig = torch.quantization.get_default_qconfig('fbgemm') - print(model.qconfig) - torch.quantization.prepare(model, inplace=True) - - with torch.no_grad(): - # warmup, reduce variability of first batch time, especially for comparing torchscript vs non - input = torch.randn((args.batch_size,) + data_config['input_size']) - model(input) - end = time.time() - for batch_idx, (input, target) in enumerate(calib_loader): - if batch_idx > args.calib_iter: - break - if 
args.no_prefetcher:
-                if args.fp16:
-                    input = input.half()
-            # compute output
-            output = model(input)
-            if valid_labels is not None:
-                output = output[:, valid_labels]
-            loss = criterion(output, target)
-
-            if real_labels is not None:
-                real_labels.add_result(output)
-
-            # measure accuracy and record loss
-            acc1, acc5 = accuracy(output.data, target, topk=(1, 5))
-            losses.update(loss.item(), input.size(0))
-            top1.update(acc1.item(), input.size(0))
-            top5.update(acc5.item(), input.size(0))
-
-            # measure elapsed time
-            batch_time.update(time.time() - end)
+    print('Start calibration of quantization observers before post-quantization')
+    model_to_quantize = copy.deepcopy(model)
+    model_to_quantize.eval()
+
+    # post-training static quantization
+    if args.quant_option == 'static':
+        qconfig_dict = {"": torch.quantization.get_default_qconfig('qnnpack')}
+        # prepare
+        model_prepared = quantize_fx.prepare_fx(model_to_quantize, qconfig_dict)
+        # calibrate
+        with torch.no_grad():
+            # warmup, reduce variability of first batch time, especially for comparing torchscript vs non
+            input = torch.randn((args.batch_size,) + tuple(data_config['input_size']))
+            if args.channels_last:
+                input = input.contiguous(memory_format=torch.channels_last)
+            model(input)
             end = time.time()
-
-            if batch_idx % args.log_freq == 0:
-                _logger.info(
-                    'Test: [{0:>4d}/{1}] '
-                    'Time: {batch_time.val:.3f}s ({batch_time.avg:.3f}s, {rate_avg:>7.2f}/s) '
-                    'Loss: {loss.val:>7.4f} ({loss.avg:>6.4f}) '
-                    'Acc@1: {top1.val:>7.3f} ({top1.avg:>7.3f}) '
-                    'Acc@5: {top5.val:>7.3f} ({top5.avg:>7.3f})'.format(
-                        batch_idx, args.calib_iter, batch_time=batch_time,
-                        rate_avg=input.size(0) / batch_time.avg,
-                        loss=losses, top1=top1, top5=top5))
-
-    if real_labels is not None:
-        # real labels mode replaces topk values at the end
-        top1a, top5a = real_labels.get_accuracy(k=1), real_labels.get_accuracy(k=5)
+            for batch_idx, (input, target) in enumerate(calib_loader):
+
+                if args.channels_last:
+                    input = input.contiguous(memory_format=torch.channels_last)
+
+                # compute output so the observers see calibration data
+                output = model_prepared(input)
+                if valid_labels is not None:
+                    output = output[:, valid_labels]
+                loss = criterion(output, target)
+
+                if real_labels is not None:
+                    real_labels.add_result(output)
+
+                # measure accuracy and record loss
+                acc1, acc5 = accuracy(output.detach(), target, topk=(1, 5))
+                losses.update(loss.item(), input.size(0))
+                top1.update(acc1.item(), input.size(0))
+                top5.update(acc5.item(), input.size(0))
+
+                # measure elapsed time
+                batch_time.update(time.time() - end)
+                end = time.time()
+
+                if batch_idx % args.log_freq == 0:
+                    _logger.info(
+                        'Test: [{0:>4d}/{1}] '
+                        'Time: {batch_time.val:.3f}s ({batch_time.avg:.3f}s, {rate_avg:>7.2f}/s) '
+                        'Loss: {loss.val:>7.4f} ({loss.avg:>6.4f}) '
+                        'Acc@1: {top1.val:>7.3f} ({top1.avg:>7.3f}) '
+                        'Acc@5: {top5.val:>7.3f} ({top5.avg:>7.3f})'.format(
+                            batch_idx, len(calib_loader), batch_time=batch_time,
+                            rate_avg=input.size(0) / batch_time.avg,
+                            loss=losses, top1=top1, top5=top5))
+        # quantize
+        model_quantized = quantize_fx.convert_fx(model_prepared)
+    # post-training dynamic / weight-only quantization
+    elif args.quant_option in ('dynamic', 'weight_only'):
+        qconfig_dict = {"": torch.quantization.default_dynamic_qconfig}
+        # prepare
+        model_prepared = quantize_fx.prepare_fx(model_to_quantize, qconfig_dict)
+        # no calibration needed for dynamic/weight-only quantization
+        # quantize
+        model_quantized = quantize_fx.convert_fx(model_prepared)
     else:
-        top1a, top5a = top1.avg, top5.avg
-    results = OrderedDict(
-        top1=round(top1a, 4), top1_err=round(100 - top1a, 4),
-        top5=round(top5a, 4), top5_err=round(100 - top5a, 4),
-        param_count=round(param_count / 1e6, 2),
-        img_size=data_config['input_size'][-1],
-        cropt_pct=crop_pct,
-        interpolation=data_config['interpolation'])
+        _logger.warning("Invalid quantization option '%s'; validating the float model instead" % args.quant_option)
+        model_quantized = model_to_quantize
+    #
+    # fusion is handled inside prepare_fx/convert_fx, so validate the quantized model directly
+    #
+    model = model_quantized
+
+    # reset meters so validation stats are not mixed with calibration stats
+    batch_time = AverageMeter()
+    losses = AverageMeter()
+    top1 = AverageMeter()
+    top5 = AverageMeter()
 
-    _logger.info(' * Acc@1 {:.3f} ({:.3f}) Acc@5 {:.3f} ({:.3f})'.format(
-        results['top1'], results['top1_err'], results['top5'], results['top5_err']))
-    print('Start validation of post-quantized model')
-    torch.quantization.convert(model.eval(),inplace = True)
-
-    batch_time = AverageMeter()
-    losses = AverageMeter()
-    top1 = AverageMeter()
-    top5 = AverageMeter()
     with torch.no_grad():
         # warmup, reduce variability of first batch time, especially for comparing torchscript vs non
-        input = torch.randn((args.batch_size,) + data_config['input_size'])
+#        input = torch.randn((args.batch_size,) + tuple(data_config['input_size'])).cuda()
+        input = torch.randn((args.batch_size,) + tuple(data_config['input_size']))
+        if args.channels_last:
+            input = input.contiguous(memory_format=torch.channels_last)
         model(input)
         end = time.time()
         for batch_idx, (input, target) in enumerate(loader):
-            if args.no_prefetcher:
-                if args.fp16:
-                    input = input.half()
+            # if args.no_prefetcher:
+            #     target = target.cuda()
+            #     input = input.cuda()
+            if args.channels_last:
+                input = input.contiguous(memory_format=torch.channels_last)
 
             # compute output
-            output = model(input)
+            # amp autocast is not used; the quantized model runs on the CPU
+            output = model(input)
+
             if valid_labels is not None:
                 output = output[:, valid_labels]
             loss = criterion(output, target)
@@ -276,7 +342,7 @@ def validate(args):
                 real_labels.add_result(output)
 
             # measure accuracy and record loss
-            acc1, acc5 = accuracy(output.data, target, topk=(1, 5))
+            acc1, acc5 = accuracy(output.detach(), target, topk=(1, 5))
             losses.update(loss.item(), input.size(0))
             top1.update(acc1.item(), input.size(0))
             top5.update(acc5.item(), input.size(0))
@@ -310,7 +376,8 @@ def validate(args):
         interpolation=data_config['interpolation'])
 
     _logger.info(' * Acc@1 {:.3f} ({:.3f}) Acc@5 {:.3f} ({:.3f})'.format(
-        results['top1'], results['top1_err'], results['top5'], results['top5_err']))
+        results['top1'], results['top1_err'], results['top5'], results['top5_err']))
+
     return results
 
 
@@ -329,7 +396,7 @@ def main():
     if args.model == 'all':
         # validate all models in a list of names with pretrained checkpoints
         args.pretrained = True
-        model_names = list_models(pretrained=True)
+        model_names = list_models(pretrained=True, exclude_filters=['*in21k'])
         model_cfgs = [(n, '') for n in model_names]
     elif not is_model(args.model):
         # model name doesn't exist, try as wildcard filter
@@ -349,7 +416,8 @@ def main():
         result = OrderedDict(model=args.model)
         r = {}
         while not r and batch_size >= args.num_gpu:
-            torch.cuda.empty_cache()
+            # torch.cuda.empty_cache() is skipped; quantized validation runs on the CPU
             try:
                 args.batch_size = batch_size
                 print('Validating with batch size: %d' % args.batch_size)

From bc25e17bed586148ff43008d0e2f9017297b258e Mon Sep 17 00:00:00 2001
From: "taehoon.kim"
Date: Mon, 22 Mar 2021 16:15:12 +0900
Subject: [PATCH 4/4] post quantization using graph_fx(beta)
---
 post_quantization_validate.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/post_quantization_validate.py b/post_quantization_validate.py
index 73c09dd4..f026d5ab 100755 --- a/post_quantization_validate.py +++ b/post_quantization_validate.py @@ -20,6 +20,8 @@ from collections import OrderedDict from contextlib import suppress import torch.quantization +import torch.quantization.quantize_fx as quantize_fx +import copy #currently, quantization only runs on CPUs os.environ['CUDA_VISIBLE_DEVICES'] = ""
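
Note on the model changes in patch 1: the quantization-ready ReXNet (and the EfficientNet/MobileNetV3 variants) follow PyTorch's eager-mode post-training quantization recipe, with QuantStub/DeQuantStub at the model boundary, nn.quantized.FloatFunctional standing in for tensor arithmetic, and per-module fuse hooks. A minimal, self-contained sketch of that recipe follows; the TinyBlock module, shapes, and random calibration batch are illustrative stand-ins, not timm code.

import torch
import torch.nn as nn
import torch.quantization

class TinyBlock(nn.Module):
    """Illustrative stand-in, not a timm module."""
    def __init__(self):
        super().__init__()
        self.quant = torch.quantization.QuantStub()        # fp32 -> int8 entry point
        self.conv = nn.Conv2d(3, 3, 3, padding=1)
        self.bn = nn.BatchNorm2d(3)
        self.act = nn.ReLU()
        self.skip_add = nn.quantized.FloatFunctional()     # quantization-aware '+'
        self.dequant = torch.quantization.DeQuantStub()    # int8 -> fp32 exit point

    def forward(self, x):
        x = self.quant(x)
        out = self.act(self.bn(self.conv(x)))
        out = self.skip_add.add(out, x)                    # residual add via FloatFunctional
        return self.dequant(out)

    def fuse_model(self):
        # mirrors the fuse_module()/fuse_model() hooks on ConvBn/ConvBnAct/SEWithNorm above
        torch.quantization.fuse_modules(self, ['conv', 'bn', 'act'], inplace=True)

model = TinyBlock().eval()
model.fuse_model()                                                # 1. fuse conv+bn+relu
model.qconfig = torch.quantization.get_default_qconfig('fbgemm')  # 2. pick a backend qconfig
torch.quantization.prepare(model, inplace=True)                   # 3. insert observers
with torch.no_grad():
    model(torch.randn(8, 3, 32, 32))                              # 4. calibrate on sample data
torch.quantization.convert(model, inplace=True)                   # 5. swap in int8 modules
print(model(torch.randn(1, 3, 32, 32)).shape)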
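
Note on patches 3 and 4: the validation script switches to FX graph mode quantization. The sketch below shows the prepare_fx -> calibrate -> convert_fx flow the script follows, using the same prepare_fx(model, qconfig_dict) call the patch makes (the PyTorch 1.8-era signature; newer releases also require an example_inputs argument). The toy model and random calibration batches are placeholders rather than timm models or ImageNet data.

import copy
import torch
import torch.nn as nn
import torch.quantization
import torch.quantization.quantize_fx as quantize_fx

# toy stand-in for a timm model; any symbolically traceable fp32 model works
model_fp32 = nn.Sequential(
    nn.Conv2d(3, 8, 3, stride=2, padding=1),
    nn.BatchNorm2d(8),
    nn.ReLU(),
    nn.AdaptiveAvgPool2d(1),
    nn.Flatten(),
    nn.Linear(8, 10),
).eval()

# 1. pick a backend qconfig: 'fbgemm' for x86 servers, 'qnnpack' for ARM/mobile
qconfig_dict = {"": torch.quantization.get_default_qconfig('fbgemm')}

# 2. prepare: traces the model, fuses modules, and inserts observers
model_prepared = quantize_fx.prepare_fx(copy.deepcopy(model_fp32), qconfig_dict)

# 3. calibrate: run representative data so the observers record activation ranges
#    (random batches here; the script feeds calib_loader through the prepared model)
with torch.no_grad():
    for _ in range(10):
        model_prepared(torch.randn(8, 3, 224, 224))

# 4. convert: replace observed ops with quantized int8 kernels
model_int8 = quantize_fx.convert_fx(model_prepared)

# the quantized model is then validated exactly like the float model
print(model_int8(torch.randn(1, 3, 224, 224)).shape)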