From dc85e5a237753b149f72caa4b9c351f46d257200 Mon Sep 17 00:00:00 2001
From: Ross Wightman <rwightman@gmail.com>
Date: Tue, 9 Feb 2021 16:21:47 -0800
Subject: [PATCH] Add ByobNet w/ GPU-EfficientNets and RepVGG. Also add classic
 vgg models.

---
 timm/models/__init__.py |   2 +
 timm/models/byobnet.py  | 707 ++++++++++++++++++++++++++++++++++++++++
 timm/models/nfnet.py    |   7 +-
 timm/models/vgg.py      | 261 +++++++++++++++
 4 files changed, 975 insertions(+), 2 deletions(-)
 create mode 100644 timm/models/byobnet.py
 create mode 100644 timm/models/vgg.py

diff --git a/timm/models/__init__.py b/timm/models/__init__.py
index a7e85084..dc56848e 100644
--- a/timm/models/__init__.py
+++ b/timm/models/__init__.py
@@ -1,3 +1,4 @@
+from .byobnet import *
 from .cspnet import *
 from .densenet import *
 from .dla import *
@@ -23,6 +24,7 @@ from .selecsls import *
 from .senet import *
 from .sknet import *
 from .tresnet import *
+from .vgg import *
 from .vision_transformer import *
 from .vovnet import *
 from .xception import *
diff --git a/timm/models/byobnet.py b/timm/models/byobnet.py
new file mode 100644
index 00000000..680edd22
--- /dev/null
+++ b/timm/models/byobnet.py
@@ -0,0 +1,707 @@
+""" Bring-Your-Own-Blocks Network
+
+A flexible network w/ dataclass based config for stacking those NN blocks.
+
+This model is currently used to implement the following networks:
+
+GPU Efficient (ResNets) - gernet_l/m/s (original versions called genet, but this was already used (by SENet author)).
+Paper: `Neural Architecture Design for GPU-Efficient Networks` - https://arxiv.org/abs/2006.14090
+Code and weights: https://github.com/idstcv/GPU-Efficient-Networks, licensed Apache 2.0
+
+RepVGG - repvgg_*
+Paper: `Making VGG-style ConvNets Great Again` - https://arxiv.org/abs/2101.03697
+Code and weights: https://github.com/DingXiaoH/RepVGG, licensed MIT
+
+In all cases the models have been modified to fit within the design of ByobNet. I've remapped
+the original weights and verified accuracies.
+
+For GPU Efficient nets, I used the original names for the blocks since they were for the most part
+the same as original residual blocks in ResNe(X)t, DarkNet, and other existing models. Note also some
+changes introduced in RegNet were also present in the stem and bottleneck blocks for this model.
+
+A significant number of different network archs can be implemented here, including variants of the
+above nets that include attention.
+
+Hacked together by / copyright Ross Wightman, 2021.
+"""
+import math
+from dataclasses import dataclass, field
+from collections import OrderedDict
+from typing import Tuple, Dict, Optional, Union, Any, Callable
+from functools import partial
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
+from .helpers import build_model_with_cfg
+from .layers import ClassifierHead, ConvBnAct, DropPath, AvgPool2dSame, \
+    create_conv2d, get_act_layer, get_attn, convert_norm_act, make_divisible
+from .registry import register_model
+
+__all__ = ['ByobNet', 'ByobCfg', 'BlocksCfg']
+
+
+def _cfg(url='', **kwargs):
+    return {
+        'url': url, 'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (7, 7),
+        'crop_pct': 0.875, 'interpolation': 'bilinear',
+        'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD,
+        'first_conv': 'stem.conv', 'classifier': 'head.fc',
+        **kwargs
+    }
+
+
+default_cfgs = {
+    # GPU-Efficient (ResNet) weights
+    'gernet_s': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-ger-weights/gernet_s-756b4751.pth'),
+    'gernet_m': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-ger-weights/gernet_m-0873c53a.pth'),
+    'gernet_l': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-ger-weights/gernet_l-f31e2e8d.pth',
+        input_size=(3, 256, 256), pool_size=(8, 8)),
+
+    # RepVGG weights
+    'repvgg_a2': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-repvgg-weights/repvgg_a2-c1ee6d2b.pth',
+        first_conv=('stem.conv_kxk.conv', 'stem.conv_1x1.conv')),
+    'repvgg_b0': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-repvgg-weights/repvgg_b0-80ac3f1b.pth',
+        first_conv=('stem.conv_kxk.conv', 'stem.conv_1x1.conv')),
+    'repvgg_b1': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-repvgg-weights/repvgg_b1-77ca2989.pth',
+        first_conv=('stem.conv_kxk.conv', 'stem.conv_1x1.conv')),
+    'repvgg_b1g4': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-repvgg-weights/repvgg_b1g4-abde5d92.pth',
+        first_conv=('stem.conv_kxk.conv', 'stem.conv_1x1.conv')),
+    'repvgg_b2': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-repvgg-weights/repvgg_b2-25b7494e.pth',
+        first_conv=('stem.conv_kxk.conv', 'stem.conv_1x1.conv')),
+    'repvgg_b2g4': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-repvgg-weights/repvgg_b2g4-165a85f2.pth',
+        first_conv=('stem.conv_kxk.conv', 'stem.conv_1x1.conv')),
+    'repvgg_b3': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-repvgg-weights/repvgg_b3-199bc50d.pth',
+        first_conv=('stem.conv_kxk.conv', 'stem.conv_1x1.conv')),
+    'repvgg_b3g4': _cfg(
+        url='https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-repvgg-weights/repvgg_b3g4-73c370bf.pth',
+        first_conv=('stem.conv_kxk.conv', 'stem.conv_1x1.conv')),
+}
+
+
+@dataclass
+class BlocksCfg:
+    type: Union[str, nn.Module]
+    d: int  # block depth (number of block repeats in stage)
+    c: int  # number of output channels for each block in stage
+    s: int = 2  # stride of stage (first block)
+    gs: Optional[Union[int, Callable]] = None  # group-size of blocks in stage, conv is depthwise if gs == 1
+    br: float = 1.  # bottleneck-ratio of blocks in stage
+
+
+@dataclass
+class ByobCfg:
+    blocks: Tuple[BlocksCfg, ...]
+    downsample: str = 'conv1x1'
+    stem_type: str = '3x3'
+    stem_chs: int = 32
+    width_factor: float = 1.0
+    num_features: int = 0  # num out_channels for final conv, no final 1x1 conv if 0
+    zero_init_last_bn: bool = True
+
+    act_layer: str = 'relu'
+    norm_layer: nn.Module = nn.BatchNorm2d
+    attn_layer: Optional[str] = None
+    attn_kwargs: dict = field(default_factory=lambda: dict())
+
+
+def _rep_vgg_bcfg(d=(4, 6, 16, 1), wf=(1., 1., 1., 1.), groups=0):
+    c = (64, 128, 256, 512)
+    group_size = 0
+    if groups > 0:
+        group_size = lambda chs, idx: chs // groups if (idx + 1) % 2 == 0 else 0
+    bcfg = tuple([BlocksCfg(type='rep', d=d, c=c * wf, gs=group_size) for d, c, wf in zip(d, c, wf)])
+    return bcfg
+
+
+model_cfgs = dict(
+
+    gernet_l=ByobCfg(
+        blocks=(
+            BlocksCfg(type='basic', d=1, c=128, s=2, gs=0, br=1.),
+            BlocksCfg(type='basic', d=2, c=192, s=2, gs=0, br=1.),
+            BlocksCfg(type='bottle', d=6, c=640, s=2, gs=0, br=1 / 4),
+            BlocksCfg(type='bottle', d=5, c=640, s=2, gs=1, br=3.),
+            BlocksCfg(type='bottle', d=4, c=640, s=1, gs=1, br=3.),
+        ),
+        stem_chs=32,
+        num_features=2560,
+    ),
+    gernet_m=ByobCfg(
+        blocks=(
+            BlocksCfg(type='basic', d=1, c=128, s=2, gs=0, br=1.),
+            BlocksCfg(type='basic', d=2, c=192, s=2, gs=0, br=1.),
+            BlocksCfg(type='bottle', d=6, c=640, s=2, gs=0, br=1 / 4),
+            BlocksCfg(type='bottle', d=4, c=640, s=2, gs=1, br=3.),
+            BlocksCfg(type='bottle', d=1, c=640, s=1, gs=1, br=3.),
+        ),
+        stem_chs=32,
+        num_features=2560,
+    ),
+    gernet_s=ByobCfg(
+        blocks=(
+            BlocksCfg(type='basic', d=1, c=48, s=2, gs=0, br=1.),
+            BlocksCfg(type='basic', d=3, c=48, s=2, gs=0, br=1.),
+            BlocksCfg(type='bottle', d=7, c=384, s=2, gs=0, br=1 / 4),
+            BlocksCfg(type='bottle', d=2, c=560, s=2, gs=1, br=3.),
+            BlocksCfg(type='bottle', d=1, c=256, s=1, gs=1, br=3.),
+        ),
+        stem_chs=13,
+        num_features=1920,
+    ),
+
+    repvgg_a2=ByobCfg(
+        blocks=_rep_vgg_bcfg(d=(2, 4, 14, 1), wf=(1.5, 1.5, 1.5, 2.75)),
+        stem_type='rep',
+        stem_chs=64,
+    ),
+    repvgg_b0=ByobCfg(
+        blocks=_rep_vgg_bcfg(wf=(1., 1., 1., 2.5)),
+        stem_type='rep',
+        stem_chs=64,
+    ),
+    repvgg_b1=ByobCfg(
+        blocks=_rep_vgg_bcfg(wf=(2., 2., 2., 4.)),
+        stem_type='rep',
+        stem_chs=64,
+    ),
+    repvgg_b1g4=ByobCfg(
+        blocks=_rep_vgg_bcfg(wf=(2., 2., 2., 4.), groups=4),
+        stem_type='rep',
+        stem_chs=64,
+    ),
+    repvgg_b2=ByobCfg(
+        blocks=_rep_vgg_bcfg(wf=(2.5, 2.5, 2.5, 5.)),
+        stem_type='rep',
+        stem_chs=64,
+    ),
+    repvgg_b2g4=ByobCfg(
+        blocks=_rep_vgg_bcfg(wf=(2.5, 2.5, 2.5, 5.), groups=4),
+        stem_type='rep',
+        stem_chs=64,
+    ),
+    repvgg_b3=ByobCfg(
+        blocks=_rep_vgg_bcfg(wf=(3., 3., 3., 5.)),
+        stem_type='rep',
+        stem_chs=64,
+    ),
+    repvgg_b3g4=ByobCfg(
+        blocks=_rep_vgg_bcfg(wf=(3., 3., 3., 5.), groups=4),
+        stem_type='rep',
+        stem_chs=64,
+    ),
+)
+
+
+def _na_args(cfg: dict):
+    return dict(
+        norm_layer=cfg.get('norm_layer', nn.BatchNorm2d),
+        act_layer=cfg.get('act_layer', nn.ReLU))
+
+
+def _ex_tuple(cfg: dict, *names):
+    return tuple([cfg.get(n, None) for n in names])
+
+
+def num_groups(group_size, channels):
+    if not group_size:  # 0 or None
+        return 1  # normal conv with 1 group
+    else:
+        # NOTE group_size == 1 -> depthwise conv
+        assert channels % group_size == 0
+        return channels // group_size
+
+
+class DownsampleAvg(nn.Module):
+    def __init__(self, in_chs, out_chs, stride=1, dilation=1, apply_act=False, norm_layer=None, act_layer=None):
+        """ AvgPool Downsampling as in 'D' ResNet variants."""
+        super(DownsampleAvg, self).__init__()
+        avg_stride = stride if dilation == 1 else 1
+        if stride > 1 or dilation > 1:
+            avg_pool_fn = AvgPool2dSame if avg_stride == 1 and dilation > 1 else nn.AvgPool2d
+            self.pool = avg_pool_fn(2, avg_stride, ceil_mode=True, count_include_pad=False)
+        else:
+            self.pool = nn.Identity()
+        self.conv = ConvBnAct(in_chs, out_chs, 1, apply_act=apply_act, norm_layer=norm_layer, act_layer=act_layer)
+
+    def forward(self, x):
+        return self.conv(self.pool(x))
+
+
+def create_downsample(type, **kwargs):
+    if type == 'avg':
+        return DownsampleAvg(**kwargs)
+    else:
+        return ConvBnAct(kwargs.pop('in_chs'), kwargs.pop('out_chs'), kernel_size=1, **kwargs)
+
+
+class BasicBlock(nn.Module):
+    """ ResNet Basic Block - kxk + kxk
+    """
+
+    def __init__(
+            self, in_chs, out_chs, kernel_size=3, stride=1, dilation=(1, 1), group_size=None, bottle_ratio=1.0,
+            downsample='avg', linear_out=False, layer_cfg=None, drop_block=None, drop_path_rate=0.):
+        super(BasicBlock, self).__init__()
+        layer_cfg = layer_cfg or {}
+        act_layer, attn_layer = _ex_tuple(layer_cfg, 'act_layer', 'attn_layer')
+        layer_args = _na_args(layer_cfg)
+        mid_chs = make_divisible(out_chs * bottle_ratio)
+        groups = num_groups(group_size, mid_chs)
+
+        if in_chs != out_chs or stride != 1 or dilation[0] != dilation[1]:
+            self.shortcut = create_downsample(
+                downsample, in_chs=in_chs, out_chs=out_chs, stride=stride, dilation=dilation[0],
+                apply_act=False, **layer_args)
+        else:
+            self.shortcut = nn.Identity()
+
+        self.conv1_kxk = ConvBnAct(in_chs, mid_chs, kernel_size, stride=stride, dilation=dilation[0], **layer_args)
+        self.conv2_kxk = ConvBnAct(
+            mid_chs, out_chs, kernel_size, dilation=dilation[1], groups=groups,
+            drop_block=drop_block, apply_act=False, **layer_args)
+        self.attn = nn.Identity() if attn_layer is None else attn_layer(out_chs)
+        self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
+        self.act = nn.Identity() if linear_out else act_layer(inplace=True)
+
+    def init_weights(self, zero_init_last_bn=False):
+        if zero_init_last_bn:
+            nn.init.zeros_(self.conv2_kxk.bn.weight)
+
+    def forward(self, x):
+        shortcut = self.shortcut(x)
+
+        # residual path
+        x = self.conv1_kxk(x)
+        x = self.conv2_kxk(x)
+        x = self.attn(x)
+        x = self.drop_path(x)
+
+        x = self.act(x + shortcut)
+        return x
+
+
+class BottleneckBlock(nn.Module):
+    """ ResNet-like Bottleneck Block - 1x1 - kxk - 1x1
+    """
+
+    def __init__(self, in_chs, out_chs, kernel_size=3, stride=1, dilation=(1, 1), bottle_ratio=1., group_size=None,
+                 downsample='avg', linear_out=False, layer_cfg=None, drop_block=None, drop_path_rate=0.):
+        super(BottleneckBlock, self).__init__()
+        layer_cfg = layer_cfg or {}
+        act_layer, attn_layer = _ex_tuple(layer_cfg, 'act_layer', 'attn_layer')
+        layer_args = _na_args(layer_cfg)
+        mid_chs = make_divisible(out_chs * bottle_ratio)
+        groups = num_groups(group_size, mid_chs)
+
+        if in_chs != out_chs or stride != 1 or dilation[0] != dilation[1]:
+            self.shortcut = create_downsample(
+                downsample, in_chs=in_chs, out_chs=out_chs, stride=stride, dilation=dilation[0],
+                apply_act=False, **layer_args)
+        else:
+            self.shortcut = nn.Identity()
+
+        self.conv1_1x1 = ConvBnAct(in_chs, mid_chs, 1, **layer_args)
+        self.conv2_kxk = ConvBnAct(
+            mid_chs, mid_chs, kernel_size, stride=stride, dilation=dilation[0],
+            groups=groups, drop_block=drop_block, **layer_args)
+        self.attn = nn.Identity() if attn_layer is None else attn_layer(mid_chs)
+        self.conv3_1x1 = ConvBnAct(mid_chs, out_chs, 1, apply_act=False, **layer_args)
+        self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
+        self.act = nn.Identity() if linear_out else act_layer(inplace=True)
+
+    def init_weights(self, zero_init_last_bn=False):
+        if zero_init_last_bn:
+            nn.init.zeros_(self.conv3_1x1.bn.weight)
+
+    def forward(self, x):
+        shortcut = self.shortcut(x)
+
+        x = self.conv1_1x1(x)
+        x = self.conv2_kxk(x)
+        x = self.attn(x)
+        x = self.conv3_1x1(x)
+        x = self.drop_path(x)
+
+        x = self.act(x + shortcut)
+        return x
+
+
+class DarkBlock(nn.Module):
+    """ DarkNet-like (1x1 + 3x3 w/ stride) block
+
+    The GE-Net impl included a 1x1 + 3x3 block in their search space. It was not used in the feature models.
+    This block is pretty much a DarkNet block (also DenseNet) hence the name. Neither DarkNet or DenseNet
+    uses strides within the block (external 3x3 or maxpool downsampling is done in front of the block repeats).
+
+    If one does want to use a lot of these blocks w/ stride, I'd recommend using the EdgeBlock (3x3 /w stride + 1x1)
+    for more optimal compute.
+    """
+
+    def __init__(self, in_chs, out_chs, kernel_size=3, stride=1, dilation=(1, 1), bottle_ratio=1.0, group_size=None,
+                 downsample='avg', linear_out=False, layer_cfg=None, drop_block=None, drop_path_rate=0.):
+        super(DarkBlock, self).__init__()
+        layer_cfg = layer_cfg or {}
+        act_layer, attn_layer = _ex_tuple(layer_cfg, 'act_layer', 'attn_layer')
+        layer_args = _na_args(layer_cfg)
+        mid_chs = make_divisible(out_chs * bottle_ratio)
+        groups = num_groups(group_size, mid_chs)
+
+        if in_chs != out_chs or stride != 1 or dilation[0] != dilation[1]:
+            self.shortcut = create_downsample(
+                downsample, in_chs=in_chs, out_chs=out_chs, stride=stride, dilation=dilation[0],
+                apply_act=False, **layer_args)
+        else:
+            self.shortcut = nn.Identity()
+
+        self.conv1_1x1 = ConvBnAct(in_chs, mid_chs, 1, **layer_args)
+        self.conv2_kxk = ConvBnAct(
+            mid_chs, out_chs, kernel_size, stride=stride, dilation=dilation[0],
+            groups=groups,  drop_block=drop_block, apply_act=False, **layer_args)
+        self.attn = nn.Identity() if attn_layer is None else attn_layer(out_chs)
+        self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
+        self.act = nn.Identity() if linear_out else act_layer(inplace=True)
+
+    def init_weights(self, zero_init_last_bn=False):
+        if zero_init_last_bn:
+            nn.init.zeros_(self.conv2_kxk.bn.weight)
+
+    def forward(self, x):
+        shortcut = self.shortcut(x)
+
+        x = self.conv1_1x1(x)
+        x = self.conv2_kxk(x)
+        x = self.attn(x)
+        x = self.drop_path(x)
+        x = self.act(x + shortcut)
+        return x
+
+
+class EdgeBlock(nn.Module):
+    """ EdgeResidual-like (3x3 + 1x1) block
+
+    A two layer block like DarkBlock, but with the order of the 3x3 and 1x1 convs reversed.
+    Very similar to the EfficientNet Edge-Residual block but this block it ends with activations, is
+    intended to be used with either expansion or bottleneck contraction, and can use DW/group/non-grouped convs.
+
+    FIXME is there a more common 3x3 + 1x1 conv block to name this after?
+    """
+
+    def __init__(self, in_chs, out_chs, kernel_size=3, stride=1, dilation=(1, 1), bottle_ratio=1.0, group_size=None,
+                 downsample='avg', linear_out=False, layer_cfg=None, drop_block=None, drop_path_rate=0.):
+        super(EdgeBlock, self).__init__()
+        layer_cfg = layer_cfg or {}
+        act_layer, attn_layer = _ex_tuple(layer_cfg, 'act_layer', 'attn_layer')
+        layer_args = _na_args(layer_cfg)
+        mid_chs = make_divisible(out_chs * bottle_ratio)
+        groups = num_groups(group_size, mid_chs)
+
+        if in_chs != out_chs or stride != 1 or dilation[0] != dilation[1]:
+            self.shortcut = create_downsample(
+                downsample, in_chs=in_chs, out_chs=out_chs, stride=stride, dilation=dilation[0],
+                apply_act=False, **layer_args)
+        else:
+            self.shortcut = nn.Identity()
+
+        self.conv1_kxk = ConvBnAct(
+            in_chs, mid_chs, kernel_size, stride=stride, dilation=dilation[0],
+            groups=groups,  drop_block=drop_block, **layer_args)
+        self.attn = nn.Identity() if attn_layer is None else attn_layer(out_chs)
+        self.conv2_1x1 = ConvBnAct(mid_chs, out_chs, 1, apply_act=False, **layer_args)
+        self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
+        self.act = nn.Identity() if linear_out else act_layer(inplace=True)
+
+    def init_weights(self, zero_init_last_bn=False):
+        if zero_init_last_bn:
+            nn.init.zeros_(self.conv2_1x1.bn.weight)
+
+    def forward(self, x):
+        shortcut = self.shortcut(x)
+
+        x = self.conv1_kxk(x)
+        x = self.attn(x)
+        x = self.conv2_1x1(x)
+        x = self.drop_path(x)
+        x = self.act(x + shortcut)
+        return x
+
+
+class RepVggBlock(nn.Module):
+    """ RepVGG Block.
+
+    Adapted from impl at https://github.com/DingXiaoH/RepVGG
+
+    This version does not currently support the deploy optimization. It is currently fixed in 'train' model.
+    """
+
+    def __init__(self, in_chs, out_chs, kernel_size=3, stride=1, dilation=(1, 1), bottle_ratio=1.0, group_size=None,
+                 downsample='', layer_cfg=None, drop_block=None, drop_path_rate=0.):
+        super(RepVggBlock, self).__init__()
+        layer_cfg = layer_cfg or {}
+        act_layer, norm_layer, attn_layer = _ex_tuple(layer_cfg, 'act_layer', 'norm_layer', 'attn_layer')
+        norm_layer = convert_norm_act(norm_layer=norm_layer, act_layer=act_layer)
+        layer_args = _na_args(layer_cfg)
+        groups = num_groups(group_size, in_chs)
+
+        use_ident = in_chs == out_chs and stride == 1 and dilation[0] == dilation[1]
+        self.identity = norm_layer(out_chs, apply_act=False) if use_ident else None
+        self.conv_kxk = ConvBnAct(
+            in_chs, out_chs, kernel_size, stride=stride, dilation=dilation[0],
+            groups=groups, drop_block=drop_block, apply_act=False, **layer_args)
+        self.conv_1x1 = ConvBnAct(in_chs, out_chs, 1, stride=stride, groups=groups, apply_act=False, **layer_args)
+        self.attn = None if attn_layer is None else attn_layer(out_chs)
+        self.drop_path = DropPath(drop_path_rate) if drop_path_rate > 0. else nn.Identity()
+        self.act = act_layer(inplace=True)
+
+    def init_weights(self, zero_init_last_bn=False):
+        # NOTE this init overrides that base model init with specific changes for the block type
+        for m in self.modules():
+            if isinstance(m, nn.BatchNorm2d):
+                nn.init.normal_(m.weight, .1, .1)
+                nn.init.normal_(m.bias, 0, .1)
+
+    def forward(self, x):
+        if self.identity is None:
+            identity = 0
+        else:
+            identity = self.identity(x)
+        x = self.conv_1x1(x) + self.conv_kxk(x)
+        if self.attn is not None:
+            x = self.attn(x)
+        x = self.drop_path(x)
+        x = self.act(x + identity)
+        return x
+
+
+_block_registry = dict(
+    basic=BasicBlock,
+    bottle=BottleneckBlock,
+    dark=DarkBlock,
+    edge=EdgeBlock,
+    rep=RepVggBlock,
+)
+
+
+def register_block(block_type:str, block_fn: nn.Module):
+    _block_registry[block_type] = block_fn
+
+
+def create_block(block: Union[str, nn.Module], **kwargs):
+    if isinstance(block, (nn.Module, partial)):
+        return block(**kwargs)
+    assert block in _block_registry, f'Unknown block type ({block}'
+    return _block_registry[block](**kwargs)
+
+
+def create_stem(in_chs, out_chs, stem_type='', layer_cfg=None):
+    layer_cfg = layer_cfg or {}
+    layer_args = _na_args(layer_cfg)
+    assert stem_type in ('', 'deep', 'deep_tiered', '3x3', '7x7', 'rep')
+    if 'deep' in stem_type:
+        # 3 deep 3x3 conv stack
+        stem = OrderedDict()
+        stem_chs = (out_chs // 2, out_chs // 2)
+        if 'tiered' in stem_type:
+            stem_chs = (3 * stem_chs[0] // 4, stem_chs[1])
+        norm_layer, act_layer = _ex_tuple(layer_args, 'norm_layer', 'act_layer')
+        stem['conv1'] = create_conv2d(in_chs, stem_chs[0], kernel_size=3, stride=2)
+        stem['conv2'] = create_conv2d(stem_chs[0], stem_chs[1], kernel_size=3, stride=1)
+        stem['conv3'] = create_conv2d(stem_chs[1], out_chs, kernel_size=3, stride=1)
+        norm_act_layer = convert_norm_act(norm_layer=norm_layer, act_layer=act_layer)
+        stem['na'] = norm_act_layer(out_chs)
+        stem = nn.Sequential(stem)
+    elif '7x7' in stem_type:
+        # 7x7 stem conv as in ResNet
+        stem = ConvBnAct(in_chs, out_chs, 7, stride=2, **layer_args)
+    elif 'rep' in stem_type:
+        stem = RepVggBlock(in_chs, out_chs, stride=2, layer_cfg=layer_cfg)
+    else:
+        # 3x3 stem conv as in RegNet
+        stem = ConvBnAct(in_chs, out_chs, 3, stride=2, **layer_args)
+
+    return stem
+
+
+class ByobNet(nn.Module):
+    """ 'Bring-your-own-blocks' Net
+
+    A flexible network backbone that allows building model stem + blocks via
+    dataclass cfg definition w/ factory functions for module instantiation.
+
+    Current assumption is that both stem and blocks are in conv-bn-act order (w/ block ending in act).
+    """
+    def __init__(self, cfg: ByobCfg, num_classes=1000, in_chans=3, global_pool='avg', output_stride=32,
+                 zero_init_last_bn=True, drop_rate=0., drop_path_rate=0.):
+        super().__init__()
+        self.num_classes = num_classes
+        self.drop_rate = drop_rate
+        norm_layer = cfg.norm_layer
+        act_layer = get_act_layer(cfg.act_layer)
+        attn_layer = partial(get_attn(cfg.attn_layer), **cfg.attn_kwargs) if cfg.attn_layer else None
+        layer_cfg = dict(norm_layer=norm_layer, act_layer=act_layer, attn_layer=attn_layer)
+
+        stem_chs = int(round((cfg.stem_chs or cfg.blocks[0].c) * cfg.width_factor))
+        self.stem = create_stem(in_chans, stem_chs, cfg.stem_type, layer_cfg=layer_cfg)
+
+        self.feature_info = []
+        depths = [bc.d for bc in cfg.blocks]
+        dpr = [x.tolist() for x in torch.linspace(0, drop_path_rate, sum(depths)).split(depths)]
+        prev_name = 'stem'
+        prev_chs = stem_chs
+        net_stride = 2
+        dilation = 1
+        stages = []
+        for stage_idx, block_cfg in enumerate(cfg.blocks):
+            stride = block_cfg.s
+            if stride != 1:
+                self.feature_info.append(dict(num_chs=prev_chs, reduction=net_stride, module=prev_name))
+            if net_stride >= output_stride and stride > 1:
+                dilation *= stride
+                stride = 1
+            net_stride *= stride
+            first_dilation = 1 if dilation in (1, 2) else 2
+
+            blocks = []
+            for block_idx in range(block_cfg.d):
+                out_chs = make_divisible(block_cfg.c * cfg.width_factor)
+                group_size = block_cfg.gs
+                if isinstance(group_size, Callable):
+                    group_size = group_size(out_chs, block_idx)
+                block_kwargs = dict(  # Blocks used in this model must accept these arguments
+                    in_chs=prev_chs,
+                    out_chs=out_chs,
+                    stride=stride if block_idx == 0 else 1,
+                    dilation=(first_dilation, dilation),
+                    group_size=group_size,
+                    bottle_ratio=block_cfg.br,
+                    downsample=cfg.downsample,
+                    drop_path_rate=dpr[stage_idx][block_idx],
+                    layer_cfg=layer_cfg,
+                )
+                blocks += [create_block(block_cfg.type, **block_kwargs)]
+                first_dilation = dilation
+                prev_chs = out_chs
+            stages += [nn.Sequential(*blocks)]
+            prev_name = f'stages.{stage_idx}'
+        self.stages = nn.Sequential(*stages)
+
+        if cfg.num_features:
+            self.num_features = int(round(cfg.width_factor * cfg.num_features))
+            self.final_conv = ConvBnAct(prev_chs, self.num_features, 1, **_na_args(layer_cfg))
+        else:
+            self.num_features = prev_chs
+            self.final_conv = nn.Identity()
+        self.feature_info += [dict(num_chs=self.num_features, reduction=net_stride, module='final_conv')]
+
+        self.head = ClassifierHead(self.num_features, num_classes, pool_type=global_pool, drop_rate=self.drop_rate)
+
+        for n, m in self.named_modules():
+            if isinstance(m, nn.Conv2d):
+                fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+                fan_out //= m.groups
+                m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
+                if m.bias is not None:
+                    m.bias.data.zero_()
+            elif isinstance(m, nn.Linear):
+                nn.init.normal_(m.weight, mean=0.0, std=0.01)
+                nn.init.zeros_(m.bias)
+            elif isinstance(m, nn.BatchNorm2d):
+                nn.init.ones_(m.weight)
+                nn.init.zeros_(m.bias)
+        for m in self.modules():
+            # call each block's weight init for block-specific overrides to init above
+            if hasattr(m, 'init_weights'):
+                m.init_weights(zero_init_last_bn=zero_init_last_bn)
+
+    def get_classifier(self):
+        return self.head.fc
+
+    def reset_classifier(self, num_classes, global_pool='avg'):
+        self.head = ClassifierHead(self.num_features, num_classes, pool_type=global_pool, drop_rate=self.drop_rate)
+
+    def forward_features(self, x):
+        x = self.stem(x)
+        x = self.stages(x)
+        x = self.final_conv(x)
+        return x
+
+    def forward(self, x):
+        x = self.forward_features(x)
+        x = self.head(x)
+        return x
+
+
+def _create_byobnet(variant, pretrained=False, **kwargs):
+    return build_model_with_cfg(
+        ByobNet, variant, pretrained,
+        default_cfg=default_cfgs[variant],
+        model_cfg=model_cfgs[variant],
+        feature_cfg=dict(flatten_sequential=True),
+        **kwargs)
+
+
+@register_model
+def gernet_l(pretrained=False, **kwargs):
+    return _create_byobnet('gernet_l', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def gernet_m(pretrained=False, **kwargs):
+    return _create_byobnet('gernet_m', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def gernet_s(pretrained=False, **kwargs):
+    return _create_byobnet('gernet_s', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def repvgg_a2(pretrained=False, **kwargs):
+    return _create_byobnet('repvgg_a2', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def repvgg_b0(pretrained=False, **kwargs):
+    return _create_byobnet('repvgg_b0', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def repvgg_b1(pretrained=False, **kwargs):
+    return _create_byobnet('repvgg_b1', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def repvgg_b1g4(pretrained=False, **kwargs):
+    return _create_byobnet('repvgg_b1g4', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def repvgg_b2(pretrained=False, **kwargs):
+    return _create_byobnet('repvgg_b2', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def repvgg_b2g4(pretrained=False, **kwargs):
+    return _create_byobnet('repvgg_b2g4', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def repvgg_b3(pretrained=False, **kwargs):
+    return _create_byobnet('repvgg_b3', pretrained=pretrained, **kwargs)
+
+
+@register_model
+def repvgg_b3g4(pretrained=False, **kwargs):
+    return _create_byobnet('repvgg_b3g4', pretrained=pretrained, **kwargs)
diff --git a/timm/models/nfnet.py b/timm/models/nfnet.py
index 74c4fe33..c56c5780 100644
--- a/timm/models/nfnet.py
+++ b/timm/models/nfnet.py
@@ -395,8 +395,11 @@ def _create_normfreenet(variant, pretrained=False, **kwargs):
         feature_cfg['out_indices'] = (1, 2, 3, 4)  # no stride 2, 0 level feat for stride 4 maxpool stems in ResNet
 
     return build_model_with_cfg(
-        NormalizerFreeNet, variant, pretrained, model_cfg=model_cfg, default_cfg=default_cfgs[variant],
-        feature_cfg=feature_cfg, **kwargs)
+        NormalizerFreeNet, variant, pretrained,
+        default_cfg=default_cfgs[variant],
+        model_cfg=model_cfg,
+        feature_cfg=feature_cfg,
+        **kwargs)
 
 
 @register_model
diff --git a/timm/models/vgg.py b/timm/models/vgg.py
new file mode 100644
index 00000000..ceede650
--- /dev/null
+++ b/timm/models/vgg.py
@@ -0,0 +1,261 @@
+"""VGG
+
+Adapted from https://github.com/pytorch/vision 'vgg.py' (BSD-3-Clause) with a few changes for
+timm functionality.
+
+Copyright 2021 Ross Wightman
+"""
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from typing import Union, List, Dict, Any, cast
+
+from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
+from .helpers import build_model_with_cfg
+from .layers import ClassifierHead, ConvBnAct
+from .registry import register_model
+
+__all__ = [
+    'VGG', 'vgg11', 'vgg11_bn', 'vgg13', 'vgg13_bn', 'vgg16', 'vgg16_bn',
+    'vgg19_bn', 'vgg19',
+]
+
+
+def _cfg(url='', **kwargs):
+    return {
+        'url': url,
+        'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': (1, 1),
+        'crop_pct': 0.875, 'interpolation': 'bilinear',
+        'mean': IMAGENET_DEFAULT_MEAN, 'std': IMAGENET_DEFAULT_STD,
+        'first_conv': 'features.0', 'classifier': 'head.fc',
+        **kwargs
+    }
+
+
+default_cfgs = {
+    'vgg11': _cfg(url='https://download.pytorch.org/models/vgg11-bbd30ac9.pth'),
+    'vgg13': _cfg(url='https://download.pytorch.org/models/vgg13-c768596a.pth'),
+    'vgg16': _cfg(url='https://download.pytorch.org/models/vgg16-397923af.pth'),
+    'vgg19': _cfg(url='https://download.pytorch.org/models/vgg19-dcbb9e9d.pth'),
+    'vgg11_bn': _cfg(url='https://download.pytorch.org/models/vgg11_bn-6002323d.pth'),
+    'vgg13_bn': _cfg(url='https://download.pytorch.org/models/vgg13_bn-abd245e5.pth'),
+    'vgg16_bn': _cfg(url='https://download.pytorch.org/models/vgg16_bn-6c64b313.pth'),
+    'vgg19_bn': _cfg(url='https://download.pytorch.org/models/vgg19_bn-c79401a0.pth'),
+}
+
+
+cfgs: Dict[str, List[Union[str, int]]] = {
+    'vgg11': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
+    'vgg13': [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
+    'vgg16': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'],
+    'vgg19': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'],
+}
+
+
+class ConvMlp(nn.Module):
+
+    def __init__(self, in_features=512, out_features=4096, kernel_size=7, mlp_ratio=1.0,
+                 drop_rate: float = 0.2, act_layer: nn.Module = None, conv_layer: nn.Module = None):
+        super(ConvMlp, self).__init__()
+        self.input_kernel_size = kernel_size
+        mid_features = int(out_features * mlp_ratio)
+        self.fc1 = conv_layer(in_features, mid_features, kernel_size, bias=True)
+        self.act1 = act_layer(True)
+        self.drop = nn.Dropout(drop_rate)
+        self.fc2 = conv_layer(mid_features, out_features, 1, bias=True)
+        self.act2 = act_layer(True)
+
+    def forward(self, x):
+        if x.shape[-2] < self.input_kernel_size or x.shape[-1] < self.input_kernel_size:
+            # keep the input size >= 7x7
+            output_size = (max(self.input_kernel_size, x.shape[-2]), max(self.input_kernel_size, x.shape[-1]))
+            x = F.adaptive_avg_pool2d(x, output_size)
+        x = self.fc1(x)
+        x = self.act1(x)
+        x = self.drop(x)
+        x = self.fc2(x)
+        x = self.act2(x)
+        return x
+
+
+class VGG(nn.Module):
+
+    def __init__(
+        self,
+        cfg: List[Any],
+        num_classes: int = 1000,
+        in_chans: int = 3,
+        output_stride: int = 32,
+        mlp_ratio: float = 1.0,
+        act_layer: nn.Module = nn.ReLU,
+        conv_layer: nn.Module = nn.Conv2d,
+        norm_layer: nn.Module = None,
+        global_pool: str = 'avg',
+        drop_rate: float = 0.,
+    ) -> None:
+        super(VGG, self).__init__()
+        assert output_stride == 32
+        self.num_classes = num_classes
+        self.num_features = 4096
+        self.drop_rate = drop_rate
+        self.feature_info = []
+        prev_chs = in_chans
+        net_stride = 1
+        pool_layer = nn.MaxPool2d
+        layers: List[nn.Module] = []
+        for v in cfg:
+            last_idx = len(layers) - 1
+            if v == 'M':
+                self.feature_info.append(dict(num_chs=prev_chs, reduction=net_stride, module=f'features.{last_idx}'))
+                layers += [pool_layer(kernel_size=2, stride=2)]
+                net_stride *= 2
+            else:
+                v = cast(int, v)
+                conv2d = conv_layer(prev_chs, v, kernel_size=3, padding=1)
+                if norm_layer is not None:
+                    layers += [conv2d, norm_layer(v), act_layer(inplace=True)]
+                else:
+                    layers += [conv2d, act_layer(inplace=True)]
+                prev_chs = v
+        self.features = nn.Sequential(*layers)
+        self.feature_info.append(dict(num_chs=prev_chs, reduction=net_stride, module=f'features.{len(layers) - 1}'))
+        self.pre_logits = ConvMlp(
+            prev_chs, self.num_features, 7, mlp_ratio=mlp_ratio,
+            drop_rate=drop_rate, act_layer=act_layer, conv_layer=conv_layer)
+        self.head = ClassifierHead(
+            self.num_features, num_classes, pool_type=global_pool, drop_rate=drop_rate)
+
+        self._initialize_weights()
+
+    def get_classifier(self):
+        return self.head.fc
+
+    def reset_classifier(self, num_classes, global_pool='avg'):
+        self.num_classes = num_classes
+        self.head = ClassifierHead(
+            self.num_features, self.num_classes, pool_type=global_pool, drop_rate=self.drop_rate)
+
+    def forward_features(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.features(x)
+        x = self.pre_logits(x)
+        return x
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.forward_features(x)
+        x = self.head(x)
+        return x
+
+    def _initialize_weights(self) -> None:
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
+                if m.bias is not None:
+                    nn.init.constant_(m.bias, 0)
+            elif isinstance(m, nn.BatchNorm2d):
+                nn.init.constant_(m.weight, 1)
+                nn.init.constant_(m.bias, 0)
+            elif isinstance(m, nn.Linear):
+                nn.init.normal_(m.weight, 0, 0.01)
+                nn.init.constant_(m.bias, 0)
+
+
+def _filter_fn(state_dict):
+    """ convert patch embedding weight from manual patchify + linear proj to conv"""
+    out_dict = {}
+    for k, v in state_dict.items():
+        k_r = k
+        k_r = k_r.replace('classifier.0', 'pre_logits.fc1')
+        k_r = k_r.replace('classifier.3', 'pre_logits.fc2')
+        k_r = k_r.replace('classifier.6', 'head.fc')
+        if 'classifier.0.weight' in k:
+            v = v.reshape(-1, 512, 7, 7)
+        if 'classifier.3.weight' in k:
+            v = v.reshape(-1, 4096, 1, 1)
+        out_dict[k_r] = v
+    return out_dict
+
+
+def _create_vgg(variant: str, pretrained: bool, **kwargs: Any) -> VGG:
+    cfg = variant.split('_')[0]
+    # NOTE: VGG is one of the only models with stride==1 features, so indices are offset from other models
+    out_indices = kwargs.get('out_indices', (0, 1, 2, 3, 4, 5))
+    model = build_model_with_cfg(
+        VGG, variant, pretrained=pretrained,
+        model_cfg=cfgs[cfg],
+        default_cfg=default_cfgs[variant],
+        feature_cfg=dict(flatten_sequential=True, out_indices=out_indices),
+        pretrained_filter_fn=_filter_fn,
+        **kwargs)
+    return model
+
+
+@register_model
+def vgg11(pretrained: bool = False, **kwargs: Any) -> VGG:
+    r"""VGG 11-layer model (configuration "A") from
+    `"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>`._
+    """
+    model_args = dict(**kwargs)
+    return _create_vgg('vgg11', pretrained=pretrained, **model_args)
+
+
+@register_model
+def vgg11_bn(pretrained: bool = False, **kwargs: Any) -> VGG:
+    r"""VGG 11-layer model (configuration "A") with batch normalization
+    `"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>`._
+    """
+    model_args = dict(norm_layer=nn.BatchNorm2d, **kwargs)
+    return _create_vgg('vgg11_bn', pretrained=pretrained, **model_args)
+
+
+@register_model
+def vgg13(pretrained: bool = False, **kwargs: Any) -> VGG:
+    r"""VGG 13-layer model (configuration "B")
+    `"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>`._
+    """
+    model_args = dict(**kwargs)
+    return _create_vgg('vgg13', pretrained=pretrained, **model_args)
+
+
+@register_model
+def vgg13_bn(pretrained: bool = False, **kwargs: Any) -> VGG:
+    r"""VGG 13-layer model (configuration "B") with batch normalization
+    `"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>`._
+    """
+    model_args = dict(norm_layer=nn.BatchNorm2d, **kwargs)
+    return _create_vgg('vgg13_bn', pretrained=pretrained, **model_args)
+
+
+@register_model
+def vgg16(pretrained: bool = False, **kwargs: Any) -> VGG:
+    r"""VGG 16-layer model (configuration "D")
+    `"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>`._
+    """
+    model_args = dict(**kwargs)
+    return _create_vgg('vgg16', pretrained=pretrained, **model_args)
+
+
+@register_model
+def vgg16_bn(pretrained: bool = False, **kwargs: Any) -> VGG:
+    r"""VGG 16-layer model (configuration "D") with batch normalization
+    `"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>`._
+    """
+    model_args = dict(norm_layer=nn.BatchNorm2d, **kwargs)
+    return _create_vgg('vgg16_bn', pretrained=pretrained, **model_args)
+
+
+@register_model
+def vgg19(pretrained: bool = False, **kwargs: Any) -> VGG:
+    r"""VGG 19-layer model (configuration "E")
+    `"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>`._
+    """
+    model_args = dict(**kwargs)
+    return _create_vgg('vgg19', pretrained=pretrained, **model_args)
+
+
+@register_model
+def vgg19_bn(pretrained: bool = False, **kwargs: Any) -> VGG:
+    r"""VGG 19-layer model (configuration 'E') with batch normalization
+    `"Very Deep Convolutional Networks For Large-Scale Image Recognition" <https://arxiv.org/pdf/1409.1556.pdf>`._
+    """
+    model_args = dict(norm_layer=nn.BatchNorm2d, **kwargs)
+    return _create_vgg('vgg19_bn', pretrained=pretrained, **model_args)
\ No newline at end of file