diff --git a/timm/models/regnet.py b/timm/models/regnet.py
index 59554c09..68bb817c 100644
--- a/timm/models/regnet.py
+++ b/timm/models/regnet.py
@@ -195,7 +195,7 @@ class RegStage(nn.Module):
     """Stage (sequence of blocks w/ the same output shape)."""
 
     def __init__(self, in_chs, out_chs, stride, dilation, depth, bottle_ratio, group_width,
-                 block_fn=Bottleneck, se_ratio=0., drop_path_rate=None, drop_block=None):
+                 block_fn=Bottleneck, se_ratio=0., drop_path_rates=None, drop_block=None):
         super(RegStage, self).__init__()
         block_kwargs = {}  # FIXME setup to pass various aa, norm, act layer common args
         first_dilation = 1 if dilation in (1, 2) else 2
@@ -203,7 +203,10 @@ class RegStage(nn.Module):
             block_stride = stride if i == 0 else 1
             block_in_chs = in_chs if i == 0 else out_chs
             block_dilation = first_dilation if i == 0 else dilation
-            drop_path = DropPath(drop_path_rate[i]) if drop_path_rate is not None else None
+            if drop_path_rates is not None and drop_path_rates[i] > 0.:
+                drop_path = DropPath(drop_path_rates[i])
+            else:
+                drop_path = None
             if (block_in_chs != out_chs) or (block_stride != 1):
                 proj_block = downsample_conv(block_in_chs, out_chs, 1, block_stride, block_dilation)
             else:
@@ -301,7 +304,7 @@ class RegNet(nn.Module):
 
         # Adjust the compatibility of ws and gws
         stage_widths, stage_groups = adjust_widths_groups_comp(stage_widths, stage_bottle_ratios, stage_groups)
-        param_names = ['out_chs', 'stride', 'dilation', 'depth', 'bottle_ratio', 'group_width', 'drop_path_rate']
+        param_names = ['out_chs', 'stride', 'dilation', 'depth', 'bottle_ratio', 'group_width', 'drop_path_rates']
         stage_params = [
             dict(zip(param_names, params)) for params in
             zip(stage_widths, stage_strides, stage_dilations, stage_depths, stage_bottle_ratios, stage_groups,
diff --git a/timm/models/rexnet.py b/timm/models/rexnet.py
index 24dfd843..a8161836 100644
--- a/timm/models/rexnet.py
+++ b/timm/models/rexnet.py
@@ -15,7 +15,7 @@ from math import ceil
 
 from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
 from .helpers import build_model_with_cfg
-from .layers import ClassifierHead, create_act_layer, ConvBnAct
+from .layers import ClassifierHead, create_act_layer, ConvBnAct, DropPath
 from .registry import register_model
 
 
@@ -56,10 +56,10 @@ def make_divisible(v, divisor=8, min_value=None):
 
 
 class SEWithNorm(nn.Module):
-    def __init__(self, channels, reduction=16, act_layer=nn.ReLU, divisor=1, reduction_channels=None,
+    def __init__(self, channels, se_ratio=1 / 12., act_layer=nn.ReLU, divisor=1, reduction_channels=None,
                  gate_layer='sigmoid'):
         super(SEWithNorm, self).__init__()
-        reduction_channels = reduction_channels or make_divisible(channels // reduction, divisor=divisor)
+        reduction_channels = reduction_channels or make_divisible(int(channels * se_ratio), divisor=divisor)
         self.fc1 = nn.Conv2d(channels, reduction_channels, kernel_size=1, bias=True)
         self.bn = nn.BatchNorm2d(reduction_channels)
         self.act = act_layer(inplace=True)
@@ -76,7 +76,7 @@ class SEWithNorm(nn.Module):
 
 
 class LinearBottleneck(nn.Module):
-    def __init__(self, in_chs, out_chs, stride, exp_ratio=1.0, use_se=True, se_rd=12, ch_div=1):
+    def __init__(self, in_chs, out_chs, stride, exp_ratio=1.0, se_ratio=0., ch_div=1, drop_path=None):
         super(LinearBottleneck, self).__init__()
         self.use_shortcut = stride == 1 and in_chs <= out_chs
         self.in_channels = in_chs
@@ -90,10 +90,11 @@ class LinearBottleneck(nn.Module):
             self.conv_exp = None
 
         self.conv_dw = ConvBnAct(dw_chs, dw_chs, 3, stride=stride, groups=dw_chs, apply_act=False)
-        self.se = SEWithNorm(dw_chs, reduction=se_rd, divisor=ch_div) if use_se else None
+        self.se = SEWithNorm(dw_chs, se_ratio=se_ratio, divisor=ch_div) if se_ratio > 0. else None
         self.act_dw = nn.ReLU6()
 
         self.conv_pwl = ConvBnAct(dw_chs, out_chs, 1, apply_act=False)
+        self.drop_path = drop_path
 
     def feat_channels(self, exp=False):
         return self.conv_dw.out_channels if exp else self.out_channels
@@ -107,12 +108,14 @@ class LinearBottleneck(nn.Module):
             x = self.se(x)
         x = self.act_dw(x)
         x = self.conv_pwl(x)
+        if self.drop_path is not None:
+            x = self.drop_path(x)
         if self.use_shortcut:
             x[:, 0:self.in_channels] += shortcut
         return x
 
 
-def _block_cfg(width_mult=1.0, depth_mult=1.0, initial_chs=16, final_chs=180, use_se=True, ch_div=1):
+def _block_cfg(width_mult=1.0, depth_mult=1.0, initial_chs=16, final_chs=180, se_ratio=0., ch_div=1):
     layers = [1, 2, 2, 3, 3, 5]
     strides = [1, 2, 2, 2, 1, 2]
     layers = [ceil(element * depth_mult) for element in layers]
@@ -127,29 +130,31 @@ def _block_cfg(width_mult=1.0, depth_mult=1.0, initial_chs=16, final_chs=180, us
         out_chs_list.append(make_divisible(round(base_chs * width_mult), divisor=ch_div))
         base_chs += final_chs / (depth // 3 * 1.0)
 
-    if use_se:
-        use_ses = [False] * (layers[0] + layers[1]) + [True] * sum(layers[2:])
-    else:
-        use_ses = [False] * sum(layers[:])
+    se_ratios = [0.] * (layers[0] + layers[1]) + [se_ratio] * sum(layers[2:])
 
-    return zip(out_chs_list, exp_ratios, strides, use_ses)
+    return list(zip(out_chs_list, exp_ratios, strides, se_ratios))
 
 
-def _build_blocks(block_cfg, prev_chs, width_mult, se_rd=12, ch_div=1, feature_location='bottleneck'):
+def _build_blocks(
+        block_cfg, prev_chs, width_mult, ch_div=1, drop_path_rate=0., feature_location='bottleneck'):
     feat_exp = feature_location == 'expansion'
     feat_chs = [prev_chs]
     feature_info = []
     curr_stride = 2
     features = []
-    for block_idx, (chs, exp_ratio, stride, se) in enumerate(block_cfg):
+    num_blocks = len(block_cfg)
+    for block_idx, (chs, exp_ratio, stride, se_ratio) in enumerate(block_cfg):
         if stride > 1:
             fname = 'stem' if block_idx == 0 else f'features.{block_idx - 1}'
             if block_idx > 0 and feat_exp:
                 fname += '.act_dw'
             feature_info += [dict(num_chs=feat_chs[-1], reduction=curr_stride, module=fname)]
             curr_stride *= stride
+        block_dpr = drop_path_rate * block_idx / (num_blocks - 1)  # stochastic depth linear decay rule
+        drop_path = DropPath(block_dpr) if block_dpr > 0. else None
         features.append(LinearBottleneck(
-            in_chs=prev_chs, out_chs=chs, exp_ratio=exp_ratio, stride=stride, use_se=se, se_rd=se_rd, ch_div=ch_div))
+            in_chs=prev_chs, out_chs=chs, exp_ratio=exp_ratio, stride=stride, se_ratio=se_ratio,
+            ch_div=ch_div, drop_path=drop_path))
         prev_chs = chs
         feat_chs += [features[-1].feat_channels(feat_exp)]
     pen_chs = make_divisible(1280 * width_mult, divisor=ch_div)
@@ -162,8 +167,8 @@ def _build_blocks(block_cfg, prev_chs, width_mult, se_rd=12, ch_div=1, feature_l
 
 class ReXNetV1(nn.Module):
     def __init__(self, in_chans=3, num_classes=1000, global_pool='avg', output_stride=32,
-                 initial_chs=16, final_chs=180, width_mult=1.0, depth_mult=1.0, use_se=True,
-                 se_rd=12, ch_div=1, drop_rate=0.2, feature_location='bottleneck'):
+                 initial_chs=16, final_chs=180, width_mult=1.0, depth_mult=1.0, se_ratio=1/12.,
+                 ch_div=1, drop_rate=0.2, drop_path_rate=0., feature_location='bottleneck'):
         super(ReXNetV1, self).__init__()
         self.drop_rate = drop_rate
         self.num_classes = num_classes
@@ -173,9 +178,9 @@ class ReXNetV1(nn.Module):
         stem_chs = make_divisible(round(stem_base_chs * width_mult), divisor=ch_div)
         self.stem = ConvBnAct(in_chans, stem_chs, 3, stride=2, act_layer='swish')
 
-        block_cfg = _block_cfg(width_mult, depth_mult, initial_chs, final_chs, use_se, ch_div)
+        block_cfg = _block_cfg(width_mult, depth_mult, initial_chs, final_chs, se_ratio, ch_div)
         features, self.feature_info = _build_blocks(
-            block_cfg, stem_chs, width_mult, se_rd, ch_div, feature_location)
+            block_cfg, stem_chs, width_mult, ch_div, drop_path_rate, feature_location)
         self.num_features = features[-1].out_channels
         self.features = nn.Sequential(*features)
 
diff --git a/timm/models/vovnet.py b/timm/models/vovnet.py
index c1183b37..f544433c 100644
--- a/timm/models/vovnet.py
+++ b/timm/models/vovnet.py
@@ -20,7 +20,7 @@ import torch.nn.functional as F
 from timm.data import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
 from .registry import register_model
 from .helpers import build_model_with_cfg
-from .layers import ConvBnAct, SeparableConvBnAct, BatchNormAct2d, ClassifierHead, \
+from .layers import ConvBnAct, SeparableConvBnAct, BatchNormAct2d, ClassifierHead, DropPath,\
     create_attn, create_norm_act, get_norm_act_layer
 
 
@@ -179,7 +179,7 @@ class SequentialAppendList(nn.Sequential):
 
 class OsaBlock(nn.Module):
 
     def __init__(self, in_chs, mid_chs, out_chs, layer_per_block, residual=False,
-                 depthwise=False, attn='', norm_layer=BatchNormAct2d, act_layer=nn.ReLU):
+                 depthwise=False, attn='', norm_layer=BatchNormAct2d, act_layer=nn.ReLU, drop_path=None):
         super(OsaBlock, self).__init__()
         self.residual = residual
@@ -212,6 +212,8 @@ class OsaBlock(nn.Module):
         else:
             self.attn = None
 
+        self.drop_path = drop_path
+
     def forward(self, x):
         output = [x]
         if self.conv_reduction is not None:
@@ -220,6 +222,8 @@ class OsaBlock(nn.Module):
         x = self.conv_concat(x)
         if self.attn is not None:
             x = self.attn(x)
+        if self.drop_path is not None:
+            x = self.drop_path(x)
         if self.residual:
             x = x + output[0]
         return x
@@ -228,7 +232,8 @@ class OsaBlock(nn.Module):
 class OsaStage(nn.Module):
 
     def __init__(self, in_chs, mid_chs, out_chs, block_per_stage, layer_per_block, downsample=True,
-                 residual=True, depthwise=False, attn='ese', norm_layer=BatchNormAct2d, act_layer=nn.ReLU):
+                 residual=True, depthwise=False, attn='ese', norm_layer=BatchNormAct2d, act_layer=nn.ReLU,
+                 drop_path_rates=None):
         super(OsaStage, self).__init__()
 
         if downsample:
@@ -239,10 +244,15 @@ class OsaStage(nn.Module):
         blocks = []
         for i in range(block_per_stage):
             last_block = i == block_per_stage - 1
+            if drop_path_rates is not None and drop_path_rates[i] > 0.:
+                drop_path = DropPath(drop_path_rates[i])
+            else:
+                drop_path = None
             blocks += [OsaBlock(
-                in_chs if i == 0 else out_chs, mid_chs, out_chs, layer_per_block, residual=residual and i > 0,
-                depthwise=depthwise, attn=attn if last_block else '', norm_layer=norm_layer, act_layer=act_layer)
+                in_chs, mid_chs, out_chs, layer_per_block, residual=residual and i > 0, depthwise=depthwise,
+                attn=attn if last_block else '', norm_layer=norm_layer, act_layer=act_layer, drop_path=drop_path)
             ]
+            in_chs = out_chs
         self.blocks = nn.Sequential(*blocks)
 
     def forward(self, x):
@@ -255,7 +265,7 @@ class OsaStage(nn.Module):
 class VovNet(nn.Module):
 
     def __init__(self, cfg, in_chans=3, num_classes=1000, global_pool='avg', drop_rate=0., stem_stride=4,
-                 output_stride=32, norm_layer=BatchNormAct2d, act_layer=nn.ReLU):
+                 output_stride=32, norm_layer=BatchNormAct2d, act_layer=nn.ReLU, drop_path_rate=0.):
         """ VovNet (v2)
         """
         super(VovNet, self).__init__()
@@ -284,6 +294,7 @@ class VovNet(nn.Module):
         current_stride = stem_stride
 
         # OSA stages
+        stage_dpr = torch.split(torch.linspace(0, drop_path_rate, sum(block_per_stage)), block_per_stage)
         in_ch_list = stem_chs[-1:] + stage_out_chs[:-1]
         stage_args = dict(residual=cfg["residual"], depthwise=cfg["depthwise"], attn=cfg["attn"], **conv_kwargs)
         stages = []
@@ -291,7 +302,7 @@ class VovNet(nn.Module):
             downsample = stem_stride == 2 or i > 0  # first stage has no stride/downsample if stem_stride is 4
             stages += [OsaStage(
                 in_ch_list[i], stage_conv_chs[i], stage_out_chs[i], block_per_stage[i], layer_per_block,
-                downsample=downsample, **stage_args)
+                downsample=downsample, drop_path_rates=stage_dpr[i], **stage_args)
             ]
             self.num_features = stage_out_chs[i]
             current_stride *= 2 if downsample else 1
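
For context, below is a minimal sketch (not part of the diff) of the stochastic depth scheduling these changes wire up: a single drop_path_rate is spread linearly over all blocks and, in the VovNet case, split into per-stage chunks, with DropPath only instantiated for blocks whose rate is non-zero. The stage_depths values and the print loop are illustrative placeholders, not taken from any real model config.

import torch
from timm.models.layers import DropPath

# Hypothetical per-stage block counts and overall rate (placeholders for illustration).
stage_depths = [1, 1, 2, 2]
drop_path_rate = 0.2

# Linear decay rule: block i of N total blocks gets drop_path_rate * i / (N - 1);
# torch.split() then regroups the flat schedule into one tensor of rates per stage,
# mirroring the stage_dpr computation added to VovNet above.
total_blocks = sum(stage_depths)
stage_dpr = torch.split(torch.linspace(0, drop_path_rate, total_blocks), stage_depths)

for stage_idx, rates in enumerate(stage_dpr):
    for block_idx, rate in enumerate(rates.tolist()):
        # As in RegStage / OsaStage, DropPath is only created for non-zero rates.
        drop_path = DropPath(rate) if rate > 0. else None
        print(f'stage {stage_idx} block {block_idx}: drop_path_rate={rate:.3f}')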