From 9d3f0b1afd78f454f5530dac5bde9a6e8a34c118 Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Thu, 9 Sep 2021 00:15:55 +0000 Subject: [PATCH 01/40] initial code --- torchvision/models/regnet.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 torchvision/models/regnet.py diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py new file mode 100644 index 00000000000..96493ae3c4c --- /dev/null +++ b/torchvision/models/regnet.py @@ -0,0 +1,13 @@ +from torch import nn + +class RegNetParams: + pass + +class SqueezeExcitation(nn.Module): + pass + +class RegNet(nn.Module): + pass + +def regnet_y_400mf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + pass From e797fcab7074c7aaabed4726cdf9e098db9d317f Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Thu, 9 Sep 2021 05:08:36 +0000 Subject: [PATCH 02/40] add SqueezeExcitation --- torchvision/models/regnet.py | 40 ++++++++++++++++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py index 96493ae3c4c..eff8273695a 100644 --- a/torchvision/models/regnet.py +++ b/torchvision/models/regnet.py @@ -1,10 +1,46 @@ -from torch import nn +from torch import nn, Tenspr class RegNetParams: pass class SqueezeExcitation(nn.Module): - pass + """ + Squeeze and excitation layer from + `"Squeeze-and-Excitation Networks" `_. + """ + + def __init__( + self, + in_channels: int, + reduction_ratio: Optional[int] = 16, + reduced_channels: Optional[int] = None, + activation: Optional[nn.Module] = None, + ) -> None: + super().__init__() + self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) + + # Either reduction_ratio is defined, or out_channels is defined, + # neither both nor none of them + assert bool(reduction_ratio) != bool(reduced_channels) + + if activation is None: + activation = nn.ReLU() + + reduced_channels = ( + in_channels // reduction_ratio if reduced_channels is None else reduced_channels + ) + self.excitation = nn.Sequential( + nn.Conv2d(in_channels, reduced_channels, kernel_size=1, stride=1, bias=True), + activation, + nn.Conv2d(reduced_channels, in_channels, kernel_size=1, stride=1, bias=True), + nn.Sigmoid(), + ) + + def forward(self, x: Tensor) -> Tensor: + x_squeezed = self.avgpool(x) + x_excited = self.excitation(x_squeezed) + x_scaled = x * x_excited + return x_scaled class RegNet(nn.Module): pass From 692fbaaa536bd50436c66efb5a2a8623c1d74285 Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Thu, 9 Sep 2021 00:15:55 +0000 Subject: [PATCH 03/40] initial code --- torchvision/models/regnet.py | 40 ++---------------------------------- 1 file changed, 2 insertions(+), 38 deletions(-) diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py index eff8273695a..96493ae3c4c 100644 --- a/torchvision/models/regnet.py +++ b/torchvision/models/regnet.py @@ -1,46 +1,10 @@ -from torch import nn, Tenspr +from torch import nn class RegNetParams: pass class SqueezeExcitation(nn.Module): - """ - Squeeze and excitation layer from - `"Squeeze-and-Excitation Networks" `_. 
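(Illustration only, not part of the patch: assuming the SqueezeExcitation module defined above is importable, it globally average-pools the input, runs two 1x1 convolutions to produce per-channel sigmoid gates, and rescales the input by them. The `Tenspr` typo and the missing `Optional`/`Any` imports in these first revisions are cleaned up when the file is rewritten later in the series.)
```
import torch

# Sketch of using the SqueezeExcitation block standalone; the output keeps the input shape,
# with each channel rescaled by a gate in [0, 1].
se = SqueezeExcitation(in_channels=64, reduction_ratio=16)   # reduced to 64 // 16 = 4 channels
x = torch.randn(2, 64, 56, 56)
out = se(x)
print(out.shape)   # torch.Size([2, 64, 56, 56])
```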
- """ - - def __init__( - self, - in_channels: int, - reduction_ratio: Optional[int] = 16, - reduced_channels: Optional[int] = None, - activation: Optional[nn.Module] = None, - ) -> None: - super().__init__() - self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) - - # Either reduction_ratio is defined, or out_channels is defined, - # neither both nor none of them - assert bool(reduction_ratio) != bool(reduced_channels) - - if activation is None: - activation = nn.ReLU() - - reduced_channels = ( - in_channels // reduction_ratio if reduced_channels is None else reduced_channels - ) - self.excitation = nn.Sequential( - nn.Conv2d(in_channels, reduced_channels, kernel_size=1, stride=1, bias=True), - activation, - nn.Conv2d(reduced_channels, in_channels, kernel_size=1, stride=1, bias=True), - nn.Sigmoid(), - ) - - def forward(self, x: Tensor) -> Tensor: - x_squeezed = self.avgpool(x) - x_excited = self.excitation(x_squeezed) - x_scaled = x * x_excited - return x_scaled + pass class RegNet(nn.Module): pass From eb6fb9f28fca11fbef5b633607ae1b71729eed8f Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Thu, 9 Sep 2021 05:08:36 +0000 Subject: [PATCH 04/40] add SqueezeExcitation --- torchvision/models/regnet.py | 40 ++++++++++++++++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py index 96493ae3c4c..eff8273695a 100644 --- a/torchvision/models/regnet.py +++ b/torchvision/models/regnet.py @@ -1,10 +1,46 @@ -from torch import nn +from torch import nn, Tenspr class RegNetParams: pass class SqueezeExcitation(nn.Module): - pass + """ + Squeeze and excitation layer from + `"Squeeze-and-Excitation Networks" `_. + """ + + def __init__( + self, + in_channels: int, + reduction_ratio: Optional[int] = 16, + reduced_channels: Optional[int] = None, + activation: Optional[nn.Module] = None, + ) -> None: + super().__init__() + self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) + + # Either reduction_ratio is defined, or out_channels is defined, + # neither both nor none of them + assert bool(reduction_ratio) != bool(reduced_channels) + + if activation is None: + activation = nn.ReLU() + + reduced_channels = ( + in_channels // reduction_ratio if reduced_channels is None else reduced_channels + ) + self.excitation = nn.Sequential( + nn.Conv2d(in_channels, reduced_channels, kernel_size=1, stride=1, bias=True), + activation, + nn.Conv2d(reduced_channels, in_channels, kernel_size=1, stride=1, bias=True), + nn.Sigmoid(), + ) + + def forward(self, x: Tensor) -> Tensor: + x_squeezed = self.avgpool(x) + x_excited = self.excitation(x_squeezed) + x_scaled = x * x_excited + return x_scaled class RegNet(nn.Module): pass From 88840c30bf42782a56da831848d57da0bdb9c785 Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Fri, 10 Sep 2021 23:21:17 +0000 Subject: [PATCH 05/40] add SqueezeExcitation --- torchvision/models/regnet.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py index eff8273695a..aa41e3096c2 100644 --- a/torchvision/models/regnet.py +++ b/torchvision/models/regnet.py @@ -1,11 +1,18 @@ from torch import nn, Tenspr +from torchvision.models.mobilenetv2 import _make_divisible + + +model_urls = { +} + class RegNetParams: pass + class SqueezeExcitation(nn.Module): """ - Squeeze and excitation layer from + Squeeze and excitation layer from `"Squeeze-and-Excitation Networks" `_. 
""" @@ -42,8 +49,20 @@ def forward(self, x: Tensor) -> Tensor: x_scaled = x * x_excited return x_scaled + class RegNet(nn.Module): pass + +def _regnet(arch: str, pretrained: bool, progress: bool, **kwargs: Any) -> RegNet: + model = RegNet() + if pretrained: + if arch not in model_urls: + raise ValueError(f"No checkpoint is available for model type {arch}") + state_dict = load_state_dict_from_url(model_urls[arch], progress=progress) + model.load_state_dict(state_dict) + return model + + def regnet_y_400mf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: - pass + return _regnet("regnet_y_400mf", pretrained, progress, **kwargs) From 8bde15a60050a9d1a44520de9050338655609b8d Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Tue, 14 Sep 2021 05:57:30 +0000 Subject: [PATCH 06/40] regnet blocks, stems and model definition --- torchvision/models/__init__.py | 1 + torchvision/models/regnet.py | 563 ++++++++++++++++++++++++++++++++- 2 files changed, 557 insertions(+), 7 deletions(-) diff --git a/torchvision/models/__init__.py b/torchvision/models/__init__.py index e57f4773c8c..07ccf8de7f5 100644 --- a/torchvision/models/__init__.py +++ b/torchvision/models/__init__.py @@ -9,6 +9,7 @@ from .mnasnet import * from .shufflenetv2 import * from .efficientnet import * +from .regnet import * from . import segmentation from . import detection from . import video diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py index aa41e3096c2..eb15da6e0ad 100644 --- a/torchvision/models/regnet.py +++ b/torchvision/models/regnet.py @@ -1,4 +1,11 @@ -from torch import nn, Tenspr +import numpy as np +import math +import torch + +from collections import OrderedDict +from enum import Enum, auto +from typing import Any, List, Optional +from torch import nn, Tensor from torchvision.models.mobilenetv2 import _make_divisible @@ -6,11 +13,140 @@ } +# The different possible blocks +class BlockType(Enum): + VANILLA_BLOCK = auto() + RES_BASIC_BLOCK = auto() + RES_BOTTLENECK_BLOCK = auto() + RES_BOTTLENECK_LINEAR_BLOCK = auto() + + +# The different possible Stems +class StemType(Enum): + RES_STEM_CIFAR = auto() + RES_STEM_IN = auto() + SIMPLE_STEM_IN = auto() + + +# The different possible activations +class ActivationType(Enum): + RELU = auto() + SILU = auto() + + class RegNetParams: - pass + def __init__( + self, + depth: int, + w_0: int, + w_a: float, + w_m: float, + group_width: int, + bottleneck_multiplier: float = 1.0, + stem_type: StemType = StemType.SIMPLE_STEM_IN, + stem_width: int = 32, + block_type: BlockType = BlockType.RES_BOTTLENECK_BLOCK, + activation: ActivationType = ActivationType.RELU, + use_se: bool = True, + se_ratio: float = 0.25, + bn_epsilon: float = 1e-05, + bn_momentum: bool = 0.1, + ) -> None: + if w_a < 0 or w_0 <= 0 or w_m <= 1 or w_0 % 8 != 0: + raise ValueError("Invalid RegNet settings") + self.depth = depth + self.w_0 = w_0 + self.w_a = w_a + self.w_m = w_m + self.group_width = group_width + self.bottleneck_multiplier = bottleneck_multiplier + self.stem_type = stem_type + self.block_type = block_type + self.activation = activation + self.stem_width = stem_width + self.use_se = use_se + self.se_ratio = se_ratio if use_se else None + self.bn_epsilon = bn_epsilon + self.bn_momentum = bn_momentum + + def get_expanded_params(self): + """ + Programatically compute all the per-block settings, + given the RegNet parameters. + + The first step is to compute the quantized linear block parameters, + in log space. 
Key parameters are: + - `w_a` is the width progression slope + - `w_0` is the initial width + - `w_m` is the width stepping in the log space + + In other terms + `log(block_width) = log(w_0) + w_m * block_capacity`, + with `bock_capacity` ramping up following the w_0 and w_a params. + This block width is finally quantized to multiples of 8. + + The second step is to compute the parameters per stage, + taking into account the skip connection and the final 1x1 convolutions. + We use the fact that the output width is constant within a stage. + """ + + QUANT = 8 + STRIDE = 2 + + # Compute the block widths. Each stage has one unique block width + widths_cont = np.arange(self.depth) * self.w_a + self.w_0 + block_capacity = np.round(np.log(widths_cont / self.w_0) / np.log(self.w_m)) + block_widths = ( + np.round(np.divide(self.w_0 * np.power(self.w_m, block_capacity), QUANT)) + * QUANT + ) + num_stages = len(np.unique(block_widths)) + block_widths = block_widths.astype(int).tolist() + + # Convert to per stage parameters + split_helper = zip( + block_widths + [0], + [0] + block_widths, + block_widths + [0], + [0] + block_widths, + ) + splits = [w != wp or r != rp for w, wp, r, rp in split_helper] + + stage_widths = [w for w, t in zip(block_widths, splits[:-1]) if t] + stage_depths = np.diff([d for d, t in enumerate(splits) if t]).tolist() + strides = [STRIDE] * num_stages + bottleneck_multipliers = [self.bottleneck_multiplier] * num_stages + group_widths = [self.group_width] * num_stages -class SqueezeExcitation(nn.Module): + # Adjust the compatibility of stage widths and group widths + stage_widths, group_widths = self._adjust_widths_groups_compatibilty( + stage_widths, bottleneck_multipliers, group_widths + ) + + return zip( + stage_widths, strides, stage_depths, group_widths, bottleneck_multipliers + ) + + @staticmethod + def _adjust_widths_groups_compatibilty( + stage_widths: List[int], bottleneck_ratios: List[float], + group_widths: List[int]) -> Tuple(List[int], List[int]): + """ + Adjusts the compatibility of widths and groups, + depending on the bottleneck ratio. + """ + # Compute all widths for the current settings + widths = [int(w * b) for w, b in zip(stage_widths, bottleneck_ratios)] + group_widths_min = [min(g, w_bot) for g, w_bot in zip(group_widths, widths)] + + # Compute the adjusted widths so that stage and group widths fit + ws_bot = [_make_divisible(w_bot, g) for w_bot, g in zip(widths, group_widths_min)] + stage_widths = [int(w_bot / b) for w_bot, b in zip(ws_bot, bottleneck_ratios)] + return stage_widths, group_widths_min + + +class _SqueezeExcitation(nn.Module): """ Squeeze and excitation layer from `"Squeeze-and-Excitation Networks" `_. 
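As a concrete illustration of the quantized width schedule described in the docstring above (illustration only, not part of the patch), the per-stage widths and depths for the RegNetY-400MF settings used later in this series (depth=16, w_0=48, w_a=27.89, w_m=2.09) can be reproduced with a few lines of NumPy:
```
import numpy as np

depth, w_0, w_a, w_m, QUANT = 16, 48, 27.89, 2.09, 8

widths_cont = np.arange(depth) * w_a + w_0                          # continuous per-block widths
steps = np.round(np.log(widths_cont / w_0) / np.log(w_m))           # quantized log-space steps
block_widths = (np.round(w_0 * np.power(w_m, steps) / QUANT) * QUANT).astype(int)

stage_widths, stage_depths = np.unique(block_widths, return_counts=True)
print(stage_widths.tolist())   # [48, 104, 208, 440]
print(stage_depths.tolist())   # [1, 3, 6, 6]
```
With group_width=8 and a bottleneck multiplier of 1.0, `_adjust_widths_groups_compatibilty` leaves these widths unchanged, since they are already multiples of the group width.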
@@ -50,12 +186,424 @@ def forward(self, x: Tensor) -> Tensor: return x_scaled +class BasicTransform(nn.Sequential): + """Basic transformation: [3x3 conv, BN, Relu] x2.""" + + def __init__( + self, + width_in: int, + width_out: int, + stride: int, + bn_epsilon: float, + bn_momentum: float, + activation: nn.Module, + ) -> None: + super().__init__() + + self.a = nn.Sequential( + nn.Conv2d(width_in, width_out, 3, stride=stride, padding=1, bias=False), + nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum), + activation, + nn.Conv2d(width_out, width_out, 3, stride=1, padding=1, bias=False), + ) + + self.final_bn = nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum) + self.depth = 2 + + +class ResStemCifar(nn.Sequential): + """ResNet stem for CIFAR: 3x3, BN, ReLU.""" + + def __init__( + self, + width_in: int, + width_out: int, + bn_epsilon: float, + bn_momentum: float, + activation: nn.Module, + ) -> None: + super().__init__() + self.stem = nn.Sequential( + nn.Conv2d(width_in, width_out, 3, stride=1, padding=1, bias=False), + nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum), + activation, + ) + self.depth = 2 + + +class ResStemIN(nn.Sequential): + """ResNet stem for ImageNet: 7x7, BN, ReLU, MaxPool.""" + + def __init__( + self, + width_in: int, + width_out: int, + bn_epsilon: float, + bn_momentum: float, + activation: nn.Module, + ) -> None: + super().__init__() + self.stem = nn.Sequential( + nn.Conv2d(width_in, width_out, 7, stride=2, padding=3, bias=False), + nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum), + activation, + nn.MaxPool2d(3, stride=2, padding=1), + ) + self.depth = 3 + + +class SimpleStemIN(nn.Sequential): + """Simple stem for ImageNet: 3x3, BN, ReLU.""" + + def __init__( + self, + width_in: int, + width_out: int, + bn_epsilon: float, + bn_momentum: float, + activation: nn.Module, + ) -> None: + super().__init__() + self.stem = nn.Sequential( + nn.Conv2d(width_in, width_out, 3, stride=2, padding=1, bias=False), + nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum), + activation, + ) + self.depth = 2 + + +class VanillaBlock(nn.Sequential): + """Vanilla block: [3x3 conv, BN, Relu] x2.""" + + def __init__( + self, + width_in: int, + width_out: int, + stride: int, + bn_epsilon: float, + bn_momentum: float, + activation: nn.Module, + *args, + **kwargs, + ) -> None: + super().__init__() + self.a = nn.Sequential( + nn.Conv2d(width_in, width_out, 3, stride=stride, padding=1, bias=False), + nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum), + activation, + ) + + self.b = nn.Sequential( + nn.Conv2d(width_out, width_out, 3, stride=1, padding=1, bias=False), + nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum), + activation, + ) + + self.depth = 2 + + +class ResBasicBlock(nn.Module): + """Residual basic block: x + F(x), F = basic transform.""" + + def __init__( + self, + width_in: int, + width_out: int, + stride: int, + bn_epsilon: float, + bn_momentum: float, + activation: nn.Module, + *args, + **kwargs, + ) -> None: + super().__init__() + self.proj_block = (width_in != width_out) or (stride != 1) + if self.proj_block: + self.proj = nn.Conv2d( + width_in, width_out, 1, stride=stride, padding=0, bias=False + ) + self.bn = nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum) + self.f = BasicTransform( + width_in, width_out, stride, bn_epsilon, bn_momentum, activation + ) + self.activation = activation + + # The projection and transform happen in parallel, + # and ReLU is not counted with 
respect to depth + self.depth = self.f.depth + + def forward(self, x: Tensor) -> Tensor: + if self.proj_block: + x = self.bn(self.proj(x)) + self.f(x) + else: + x = x + self.f(x) + + return self.activation(x) + + +class BottleneckTransform(nn.Sequential): + """Bottleneck transformation: 1x1, 3x3 [+SE], 1x1.""" + + def __init__( + self, + width_in: int, + width_out: int, + stride: int, + bn_epsilon: float, + bn_momentum: float, + activation: nn.Module, + group_width: int, + bottleneck_multiplier: float, + se_ratio: Optional[float], + ) -> None: + super().__init__() + w_b = int(round(width_out * bottleneck_multiplier)) + g = w_b // group_width + + self.a = nn.Sequential( + nn.Conv2d(width_in, w_b, 1, stride=1, padding=0, bias=False), + nn.BatchNorm2d(w_b, eps=bn_epsilon, momentum=bn_momentum), + activation, + ) + + self.b = nn.Sequential( + nn.Conv2d(w_b, w_b, 3, stride=stride, padding=1, groups=g, bias=False), + nn.BatchNorm2d(w_b, eps=bn_epsilon, momentum=bn_momentum), + activation, + ) + + if se_ratio: + # The SE reduction ratio is defined with respect to the + # beginning of the block + width_se_out = int(round(se_ratio * width_in)) + self.se = _SqueezeExcitation( + in_channels=w_b, + reduction_ratio=None, + reduced_channels=width_se_out, + activation=activation, + ) + + self.c = nn.Conv2d(w_b, width_out, 1, stride=1, padding=0, bias=False) + self.final_bn = nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum) + self.depth = 3 if not se_ratio else 4 + + +class ResBottleneckBlock(nn.Module): + """Residual bottleneck block: x + F(x), F = bottleneck transform.""" + + def __init__( + self, + width_in: int, + width_out: int, + stride: int, + bn_epsilon: float, + bn_momentum: float, + activation: nn.Module, + group_width: int = 1, + bottleneck_multiplier: float = 1.0, + se_ratio: Optional[float] = None, + ) -> None: + super().__init__() + + # Use skip connection with projection if shape changes + self.proj_block = (width_in != width_out) or (stride != 1) + if self.proj_block: + self.proj = nn.Conv2d( + width_in, width_out, 1, stride=stride, padding=0, bias=False + ) + self.bn = nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum) + self.f = BottleneckTransform( + width_in, + width_out, + stride, + bn_epsilon, + bn_momentum, + activation, + group_width, + bottleneck_multiplier, + se_ratio, + ) + self.activation = activation + + # The projection and transform happen in parallel, + # and activation is not counted with respect to depth + self.depth = self.f.depth + + def forward(self, x: Tensor) -> Tensor: + if self.proj_block: + x = self.bn(self.proj(x)) + self.f(x) + else: + x = x + self.f(x) + return self.activation(x) + + +class ResBottleneckLinearBlock(nn.Module): + """Residual linear bottleneck block: x + F(x), F = bottleneck transform.""" + + def __init__( + self, + width_in: int, + width_out: int, + stride: int, + bn_epsilon: float, + bn_momentum: float, + activation: nn.Module, + group_width: int = 1, + bottleneck_multiplier: float = 4.0, + se_ratio: Optional[float] = None, + ) -> None: + super().__init__() + self.has_skip = (width_in == width_out) and (stride == 1) + self.f = BottleneckTransform( + width_in, + width_out, + stride, + bn_epsilon, + bn_momentum, + activation, + group_width, + bottleneck_multiplier, + se_ratio, + ) + + self.depth = self.f.depth + + def forward(self, x: Tensor) -> Tensor: + return x + self.f(x) if self.has_skip else self.f(x) + + +class AnyStage(nn.Sequential): + """AnyNet stage (sequence of blocks w/ the same output shape).""" + + def 
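To make the bottleneck bookkeeping above concrete (worked numbers only, not part of the patch): for the first block of the second RegNetY-400MF stage, width_in=48, width_out=104, group_width=8, bottleneck_multiplier=1.0 and se_ratio=0.25, which gives
```
width_in, width_out = 48, 104
group_width, bottleneck_multiplier, se_ratio = 8, 1.0, 0.25

w_b = int(round(width_out * bottleneck_multiplier))   # 104 channels inside the bottleneck
groups = w_b // group_width                           # 13 groups in the 3x3 grouped conv
width_se = int(round(se_ratio * width_in))            # 12 channels in the SE reduction
print(w_b, groups, width_se)                          # 104 13 12
```
i.e. the SE reduction is computed from the block input width, not from the bottleneck width.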
__init__( + self, + width_in: int, + width_out: int, + stride: int, + depth: int, + block_constructor: nn.Module, + activation: nn.Module, + group_width: int, + bottleneck_multiplier: float, + params: "AnyNetParams", + stage_index: int = 0, + ) -> None: + super().__init__() + self.stage_depth = 0 + + for i in range(depth): + block = block_constructor( + width_in if i == 0 else width_out, + width_out, + stride if i == 0 else 1, + params.bn_epsilon, + params.bn_momentum, + activation, + group_width, + bottleneck_multiplier, + params.se_ratio, + ) + + self.stage_depth += block.depth + self.add_module(f"block{stage_index}-{i}", block) + + class RegNet(nn.Module): - pass + def __init__(self, params: RegNetParams) -> None: + super().__init__() + + if params.activation == ActivationType.SILU and torch.__version__ < "1.7": + raise ValueError("SiLU activation is only supported since PyTorch 1.7") + + silu = None if torch.__version__ < "1.7" else nn.SiLU() + activation = { + ActivationType.RELU: nn.ReLU(inplace=True), + ActivationType.SILU: silu, + }[params.activation] + + # Ad hoc stem + self.stem = { + StemType.RES_STEM_CIFAR: ResStemCifar, + StemType.RES_STEM_IN: ResStemIN, + StemType.SIMPLE_STEM_IN: SimpleStemIN, + }[params.stem_type]( + 3, # width_in + params.stem_width, + params.bn_epsilon, + params.bn_momentum, + activation, + ) + + # Instantiate all the AnyNet blocks in the trunk + block_fun = { + BlockType.VANILLA_BLOCK: VanillaBlock, + BlockType.RES_BASIC_BLOCK: ResBasicBlock, + BlockType.RES_BOTTLENECK_BLOCK: ResBottleneckBlock, + BlockType.RES_BOTTLENECK_LINEAR_BLOCK: ResBottleneckLinearBlock, + }[params.block_type] + + current_width = params.stem_width + + self.trunk_depth = 0 + + blocks = [] + for i, ( + width_out, + stride, + depth, + group_width, + bottleneck_multiplier, + ) in enumerate(params.get_expanded_params()): + blocks.append( + ( + f"block{i+1}", + AnyStage( + current_width, + width_out, + stride, + depth, + block_fun, + activation, + group_width, + bottleneck_multiplier, + params, + stage_index=i + 1, + ), + ) + ) + + self.trunk_depth += blocks[-1][1].stage_depth + + current_width = width_out + + self.trunk_output = nn.Sequential(OrderedDict(blocks)) + + # Init weights and good to go + self._init_weights() + + def forward(self, x: Tensor) -> Tensor: + x = self.stem(x) + x = self.trunk_output(x) + + return x + + def _init_weights(self) -> None: + # Performs ResNet-style weight initialization + for m in self.modules(): + if isinstance(m, nn.Conv2d): + # Note that there is no bias due to BN + fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + m.weight.data.normal_(mean=0.0, std=math.sqrt(2.0 / fan_out)) + elif isinstance(m, nn.BatchNorm2d): + m.weight.data.fill_(1.0) + m.bias.data.zero_() + elif isinstance(m, nn.Linear): + m.weight.data.normal_(mean=0.0, std=0.01) + m.bias.data.zero_() -def _regnet(arch: str, pretrained: bool, progress: bool, **kwargs: Any) -> RegNet: - model = RegNet() +def _regnet(arch: str, params: RegNetParams, pretrained: bool, progress: bool, **kwargs: Any) -> RegNet: + model = RegNet(params) if pretrained: if arch not in model_urls: raise ValueError(f"No checkpoint is available for model type {arch}") @@ -65,4 +613,5 @@ def _regnet(arch: str, pretrained: bool, progress: bool, **kwargs: Any) -> RegNe def regnet_y_400mf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: - return _regnet("regnet_y_400mf", pretrained, progress, **kwargs) + params = RegNetParams(depth=16, w_0=48, w_a=27.89, w_m=2.09, group_width=8, 
**kwargs) + return _regnet("regnet_y_400mf", params, pretrained, progress, **kwargs) From 56352a034748a9b545b54df8b95909eb57b65199 Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Tue, 14 Sep 2021 06:21:00 +0000 Subject: [PATCH 07/40] nit --- torchvision/models/regnet.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py index eb15da6e0ad..80fed6d6a6e 100644 --- a/torchvision/models/regnet.py +++ b/torchvision/models/regnet.py @@ -4,7 +4,7 @@ from collections import OrderedDict from enum import Enum, auto -from typing import Any, List, Optional +from typing import Any, List, Optional, Tuple from torch import nn, Tensor from torchvision.models.mobilenetv2 import _make_divisible @@ -131,7 +131,7 @@ def get_expanded_params(self): @staticmethod def _adjust_widths_groups_compatibilty( stage_widths: List[int], bottleneck_ratios: List[float], - group_widths: List[int]) -> Tuple(List[int], List[int]): + group_widths: List[int]) -> Tuple[List[int], List[int]]: """ Adjusts the compatibility of widths and groups, depending on the bottleneck ratio. From ce181c3ef97facacdf6841b87cda208c2e99fbb7 Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Tue, 14 Sep 2021 23:06:53 +0000 Subject: [PATCH 08/40] add fc layer --- torchvision/models/regnet.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py index 80fed6d6a6e..5d813d59b9e 100644 --- a/torchvision/models/regnet.py +++ b/torchvision/models/regnet.py @@ -51,6 +51,7 @@ def __init__( se_ratio: float = 0.25, bn_epsilon: float = 1e-05, bn_momentum: bool = 0.1, + num_classes: int = 1000, ) -> None: if w_a < 0 or w_0 <= 0 or w_m <= 1 or w_0 % 8 != 0: raise ValueError("Invalid RegNet settings") @@ -68,6 +69,7 @@ def __init__( self.se_ratio = se_ratio if use_se else None self.bn_epsilon = bn_epsilon self.bn_momentum = bn_momentum + self.num_classes = num_classes def get_expanded_params(self): """ @@ -578,13 +580,20 @@ def __init__(self, params: RegNetParams) -> None: self.trunk_output = nn.Sequential(OrderedDict(blocks)) + self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) + self.fc = nn.Linear(in_features=current_width, out_features=params.num_classes) + # Init weights and good to go self._init_weights() def forward(self, x: Tensor) -> Tensor: x = self.stem(x) x = self.trunk_output(x) - + + x = self.avgpool(x) + x = x.flatten(start_dim=1) + x = self.fc(x) + return x def _init_weights(self) -> None: From a91c32b309334e4e4e4d9084df858c45d45a607d Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Wed, 15 Sep 2021 07:19:45 +0000 Subject: [PATCH 09/40] use Callable instead of Enum for block, stem and activation --- torchvision/models/regnet.py | 296 ++++++++++++++++------------------- 1 file changed, 132 insertions(+), 164 deletions(-) diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py index 5d813d59b9e..76d3381fe5b 100644 --- a/torchvision/models/regnet.py +++ b/torchvision/models/regnet.py @@ -4,150 +4,19 @@ from collections import OrderedDict from enum import Enum, auto -from typing import Any, List, Optional, Tuple +from typing import Any, Callable, List, Optional, Tuple from torch import nn, Tensor + +from .._internally_replaced_utils import load_state_dict_from_url from torchvision.models.mobilenetv2 import _make_divisible model_urls = { + # TODO(kazhang): add pretrained weights + "regnet_y_400m": "", } -# The different possible blocks -class BlockType(Enum): - VANILLA_BLOCK = auto() - 
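With the pooling and fully connected head now in place, a minimal smoke test of the earlier builder looks like this (illustration only; no pretrained weights are assumed, since the checkpoint URLs are still TODO):
```
import torch

model = regnet_y_400mf(pretrained=False)
model.eval()
with torch.no_grad():
    logits = model(torch.randn(1, 3, 224, 224))
print(logits.shape)   # torch.Size([1, 1000]) with the default num_classes
```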
RES_BASIC_BLOCK = auto() - RES_BOTTLENECK_BLOCK = auto() - RES_BOTTLENECK_LINEAR_BLOCK = auto() - - -# The different possible Stems -class StemType(Enum): - RES_STEM_CIFAR = auto() - RES_STEM_IN = auto() - SIMPLE_STEM_IN = auto() - - -# The different possible activations -class ActivationType(Enum): - RELU = auto() - SILU = auto() - - -class RegNetParams: - def __init__( - self, - depth: int, - w_0: int, - w_a: float, - w_m: float, - group_width: int, - bottleneck_multiplier: float = 1.0, - stem_type: StemType = StemType.SIMPLE_STEM_IN, - stem_width: int = 32, - block_type: BlockType = BlockType.RES_BOTTLENECK_BLOCK, - activation: ActivationType = ActivationType.RELU, - use_se: bool = True, - se_ratio: float = 0.25, - bn_epsilon: float = 1e-05, - bn_momentum: bool = 0.1, - num_classes: int = 1000, - ) -> None: - if w_a < 0 or w_0 <= 0 or w_m <= 1 or w_0 % 8 != 0: - raise ValueError("Invalid RegNet settings") - self.depth = depth - self.w_0 = w_0 - self.w_a = w_a - self.w_m = w_m - self.group_width = group_width - self.bottleneck_multiplier = bottleneck_multiplier - self.stem_type = stem_type - self.block_type = block_type - self.activation = activation - self.stem_width = stem_width - self.use_se = use_se - self.se_ratio = se_ratio if use_se else None - self.bn_epsilon = bn_epsilon - self.bn_momentum = bn_momentum - self.num_classes = num_classes - - def get_expanded_params(self): - """ - Programatically compute all the per-block settings, - given the RegNet parameters. - - The first step is to compute the quantized linear block parameters, - in log space. Key parameters are: - - `w_a` is the width progression slope - - `w_0` is the initial width - - `w_m` is the width stepping in the log space - - In other terms - `log(block_width) = log(w_0) + w_m * block_capacity`, - with `bock_capacity` ramping up following the w_0 and w_a params. - This block width is finally quantized to multiples of 8. - - The second step is to compute the parameters per stage, - taking into account the skip connection and the final 1x1 convolutions. - We use the fact that the output width is constant within a stage. - """ - - QUANT = 8 - STRIDE = 2 - - # Compute the block widths. 
Each stage has one unique block width - widths_cont = np.arange(self.depth) * self.w_a + self.w_0 - block_capacity = np.round(np.log(widths_cont / self.w_0) / np.log(self.w_m)) - block_widths = ( - np.round(np.divide(self.w_0 * np.power(self.w_m, block_capacity), QUANT)) - * QUANT - ) - num_stages = len(np.unique(block_widths)) - block_widths = block_widths.astype(int).tolist() - - # Convert to per stage parameters - split_helper = zip( - block_widths + [0], - [0] + block_widths, - block_widths + [0], - [0] + block_widths, - ) - splits = [w != wp or r != rp for w, wp, r, rp in split_helper] - - stage_widths = [w for w, t in zip(block_widths, splits[:-1]) if t] - stage_depths = np.diff([d for d, t in enumerate(splits) if t]).tolist() - - strides = [STRIDE] * num_stages - bottleneck_multipliers = [self.bottleneck_multiplier] * num_stages - group_widths = [self.group_width] * num_stages - - # Adjust the compatibility of stage widths and group widths - stage_widths, group_widths = self._adjust_widths_groups_compatibilty( - stage_widths, bottleneck_multipliers, group_widths - ) - - return zip( - stage_widths, strides, stage_depths, group_widths, bottleneck_multipliers - ) - - @staticmethod - def _adjust_widths_groups_compatibilty( - stage_widths: List[int], bottleneck_ratios: List[float], - group_widths: List[int]) -> Tuple[List[int], List[int]]: - """ - Adjusts the compatibility of widths and groups, - depending on the bottleneck ratio. - """ - # Compute all widths for the current settings - widths = [int(w * b) for w, b in zip(stage_widths, bottleneck_ratios)] - group_widths_min = [min(g, w_bot) for g, w_bot in zip(group_widths, widths)] - - # Compute the adjusted widths so that stage and group widths fit - ws_bot = [_make_divisible(w_bot, g) for w_bot, g in zip(widths, group_widths_min)] - stage_widths = [int(w_bot / b) for w_bot, b in zip(ws_bot, bottleneck_ratios)] - return stage_widths, group_widths_min - - class _SqueezeExcitation(nn.Module): """ Squeeze and excitation layer from @@ -483,11 +352,13 @@ def __init__( width_out: int, stride: int, depth: int, - block_constructor: nn.Module, + block_constructor: Callable[..., nn.Module], + bn_epsilon: float, + bn_momentum: float, activation: nn.Module, group_width: int, bottleneck_multiplier: float, - params: "AnyNetParams", + se_ratio: Optional[float] = None, stage_index: int = 0, ) -> None: super().__init__() @@ -498,37 +369,140 @@ def __init__( width_in if i == 0 else width_out, width_out, stride if i == 0 else 1, - params.bn_epsilon, - params.bn_momentum, + bn_epsilon, + bn_momentum, activation, group_width, bottleneck_multiplier, - params.se_ratio, + se_ratio, ) self.stage_depth += block.depth self.add_module(f"block{stage_index}-{i}", block) +class RegNetParams: + def __init__( + self, + depth: int, + w_0: int, + w_a: float, + w_m: float, + group_width: int, + bottleneck_multiplier: float = 1.0, + stem_type: Callable[..., nn.Module] = SimpleStemIN, + stem_width: int = 32, + block_type: Callable[..., nn.Module] = ResBottleneckBlock, + activation: Callable[..., nn.Module] = nn.ReLU, + use_se: bool = True, + se_ratio: float = 0.25, + bn_epsilon: float = 1e-05, + bn_momentum: float = 0.1, + num_classes: int = 1000, + ) -> None: + if w_a < 0 or w_0 <= 0 or w_m <= 1 or w_0 % 8 != 0: + raise ValueError("Invalid RegNet settings") + self.depth = depth + self.w_0 = w_0 + self.w_a = w_a + self.w_m = w_m + self.group_width = group_width + self.bottleneck_multiplier = bottleneck_multiplier + self.stem_type = stem_type + self.block_type = block_type 
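Because the configuration now takes plain constructors, swapping a component is just a matter of passing a different Callable; for example (illustration only, roughly the regnet_y_400mf settings but with SiLU, which also accepts the `inplace=True` argument used internally):
```
from torch import nn

params = RegNetParams(depth=16, w_0=48, w_a=27.89, w_m=2.09, group_width=8,
                      activation=nn.SiLU)
model = RegNet(params)
```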
+ self.activation = activation + self.stem_width = stem_width + self.use_se = use_se + self.se_ratio = se_ratio if use_se else None + self.bn_epsilon = bn_epsilon + self.bn_momentum = bn_momentum + self.num_classes = num_classes + + def get_expanded_params(self): + """ + Programatically compute all the per-block settings, + given the RegNet parameters. + + The first step is to compute the quantized linear block parameters, + in log space. Key parameters are: + - `w_a` is the width progression slope + - `w_0` is the initial width + - `w_m` is the width stepping in the log space + + In other terms + `log(block_width) = log(w_0) + w_m * block_capacity`, + with `bock_capacity` ramping up following the w_0 and w_a params. + This block width is finally quantized to multiples of 8. + + The second step is to compute the parameters per stage, + taking into account the skip connection and the final 1x1 convolutions. + We use the fact that the output width is constant within a stage. + """ + + QUANT = 8 + STRIDE = 2 + + # Compute the block widths. Each stage has one unique block width + widths_cont = np.arange(self.depth) * self.w_a + self.w_0 + block_capacity = np.round(np.log(widths_cont / self.w_0) / np.log(self.w_m)) + block_widths = ( + np.round(np.divide(self.w_0 * np.power(self.w_m, block_capacity), QUANT)) + * QUANT + ) + num_stages = len(np.unique(block_widths)) + block_widths = block_widths.astype(int).tolist() + + # Convert to per stage parameters + split_helper = zip( + block_widths + [0], + [0] + block_widths, + block_widths + [0], + [0] + block_widths, + ) + splits = [w != wp or r != rp for w, wp, r, rp in split_helper] + + stage_widths = [w for w, t in zip(block_widths, splits[:-1]) if t] + stage_depths = np.diff([d for d, t in enumerate(splits) if t]).tolist() + + strides = [STRIDE] * num_stages + bottleneck_multipliers = [self.bottleneck_multiplier] * num_stages + group_widths = [self.group_width] * num_stages + + # Adjust the compatibility of stage widths and group widths + stage_widths, group_widths = self._adjust_widths_groups_compatibilty( + stage_widths, bottleneck_multipliers, group_widths + ) + + return zip( + stage_widths, strides, stage_depths, group_widths, bottleneck_multipliers + ) + + @staticmethod + def _adjust_widths_groups_compatibilty( + stage_widths: List[int], bottleneck_ratios: List[float], + group_widths: List[int]) -> Tuple[List[int], List[int]]: + """ + Adjusts the compatibility of widths and groups, + depending on the bottleneck ratio. 
+ """ + # Compute all widths for the current settings + widths = [int(w * b) for w, b in zip(stage_widths, bottleneck_ratios)] + group_widths_min = [min(g, w_bot) for g, w_bot in zip(group_widths, widths)] + + # Compute the adjusted widths so that stage and group widths fit + ws_bot = [_make_divisible(w_bot, g) for w_bot, g in zip(widths, group_widths_min)] + stage_widths = [int(w_bot / b) for w_bot, b in zip(ws_bot, bottleneck_ratios)] + return stage_widths, group_widths_min + + class RegNet(nn.Module): def __init__(self, params: RegNetParams) -> None: super().__init__() - if params.activation == ActivationType.SILU and torch.__version__ < "1.7": - raise ValueError("SiLU activation is only supported since PyTorch 1.7") - - silu = None if torch.__version__ < "1.7" else nn.SiLU() - activation = { - ActivationType.RELU: nn.ReLU(inplace=True), - ActivationType.SILU: silu, - }[params.activation] + activation = params.activation(inplace=True) # Ad hoc stem - self.stem = { - StemType.RES_STEM_CIFAR: ResStemCifar, - StemType.RES_STEM_IN: ResStemIN, - StemType.SIMPLE_STEM_IN: SimpleStemIN, - }[params.stem_type]( + self.stem = params.stem_type( 3, # width_in params.stem_width, params.bn_epsilon, @@ -536,14 +510,6 @@ def __init__(self, params: RegNetParams) -> None: activation, ) - # Instantiate all the AnyNet blocks in the trunk - block_fun = { - BlockType.VANILLA_BLOCK: VanillaBlock, - BlockType.RES_BASIC_BLOCK: ResBasicBlock, - BlockType.RES_BOTTLENECK_BLOCK: ResBottleneckBlock, - BlockType.RES_BOTTLENECK_LINEAR_BLOCK: ResBottleneckLinearBlock, - }[params.block_type] - current_width = params.stem_width self.trunk_depth = 0 @@ -564,11 +530,13 @@ def __init__(self, params: RegNetParams) -> None: width_out, stride, depth, - block_fun, + params.block_type, + params.bn_epsilon, + params.bn_momentum, activation, group_width, bottleneck_multiplier, - params, + params.se_ratio, stage_index=i + 1, ), ) From 0d1601bf6b188cc1c91f9f5bbcfc244e85910ce4 Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Wed, 15 Sep 2021 21:30:52 +0000 Subject: [PATCH 10/40] add regnet_x and regnet_y model build functions, add docs --- docs/source/models.rst | 48 +++++++ hubconf.py | 4 + references/classification/README.md | 7 + torchvision/models/regnet.py | 193 +++++++++++++++++++++++++++- 4 files changed, 251 insertions(+), 1 deletion(-) diff --git a/docs/source/models.rst b/docs/source/models.rst index 3f31455f9da..be2a007d9ae 100644 --- a/docs/source/models.rst +++ b/docs/source/models.rst @@ -37,6 +37,7 @@ architectures for image classification: - `Wide ResNet`_ - `MNASNet`_ - `EfficientNet`_ +- `RegNet`_ You can construct a model with random weights by calling its constructor: @@ -65,6 +66,20 @@ You can construct a model with random weights by calling its constructor: efficientnet_b5 = models.efficientnet_b5() efficientnet_b6 = models.efficientnet_b6() efficientnet_b7 = models.efficientnet_b7() + regnet_y_400mf = models.regnet_y_400mf() + regnet_y_800mf = models.regnet_y_800mf() + regnet_y_1_6gf = models.regnet_y_1_6gf() + regnet_y_3_2gf = models.regnet_y_3_2gf() + regnet_y_8gf = models.regnet_y_8gf() + regnet_y_16gf = models.regnet_y_16gf() + regnet_y_32gf = models.regnet_y_32gf() + regnet_x_400mf = models.regnet_x_400mf() + regnet_x_800mf = models.regnet_x_800mf() + regnet_x_1_6gf = models.regnet_x_1_6gf() + regnet_x_3_2gf = models.regnet_x_3_2gf() + regnet_x_8gf = models.regnet_x_8gf() + regnet_x_16gf = models.regnet_x_16gf() + regnet_x_32gf = models.regnet_x_32gf() We provide pre-trained models, using the PyTorch 
:mod:`torch.utils.model_zoo`. These can be constructed by passing ``pretrained=True``: @@ -94,6 +109,20 @@ These can be constructed by passing ``pretrained=True``: efficientnet_b5 = models.efficientnet_b5(pretrained=True) efficientnet_b6 = models.efficientnet_b6(pretrained=True) efficientnet_b7 = models.efficientnet_b7(pretrained=True) + regnet_y_400mf = models.regnet_y_400mf(pretrained=True) + regnet_y_800mf = models.regnet_y_800mf(pretrained=True) + regnet_y_1_6gf = models.regnet_y_1_6gf(pretrained=True) + regnet_y_3_2gf = models.regnet_y_3_2gf(pretrained=True) + regnet_y_8gf = models.regnet_y_8gf(pretrained=True) + regnet_y_16gf = models.regnet_y_16gf(pretrained=True) + regnet_y_32gf = models.regnet_y_32gf(pretrained=True) + regnet_x_400mf = models.regnet_x_400mf(pretrained=True) + regnet_x_800mf = models.regnet_x_800mf(pretrained=True) + regnet_x_1_6gf = models.regnet_x_1_6gf(pretrained=True) + regnet_x_3_2gf = models.regnet_x_3_2gf(pretrained=True) + regnet_x_8gf = models.regnet_x_8gf(pretrained=True) + regnet_x_16gf = models.regnet_x_16gf(pretrained=True) + regnet_x_32gf = models.regnet_x_32gf(pretrained=True) Instancing a pre-trained model will download its weights to a cache directory. This directory can be set using the `TORCH_MODEL_ZOO` environment variable. See @@ -204,6 +233,7 @@ EfficientNet-B7 84.122 96.908 .. _ResNeXt: https://arxiv.org/abs/1611.05431 .. _MNASNet: https://arxiv.org/abs/1807.11626 .. _EfficientNet: https://arxiv.org/abs/1905.11946 +.. _RegNet: https://arxiv.org/abs/2003.13678 .. currentmodule:: torchvision.models @@ -317,6 +347,24 @@ EfficientNet .. autofunction:: efficientnet_b6 .. autofunction:: efficientnet_b7 +EfficientNet +------------ + +.. autofunction:: regnet_y_400mf +.. autofunction:: regnet_y_800mf +.. autofunction:: regnet_y_1_6gf +.. autofunction:: regnet_y_3_2gf +.. autofunction:: regnet_y_8gf +.. autofunction:: regnet_y_16gf +.. autofunction:: regnet_y_32gf +.. autofunction:: regnet_x_400mf +.. autofunction:: regnet_x_800mf +.. autofunction:: regnet_x_1_6gf +.. autofunction:: regnet_x_3_2gf +.. autofunction:: regnet_x_8gf +.. autofunction:: regnet_x_16gf +.. autofunction:: regnet_x_32gf + Quantized Models ---------------- diff --git a/hubconf.py b/hubconf.py index 2bff6850525..8412e9e6e6b 100644 --- a/hubconf.py +++ b/hubconf.py @@ -17,6 +17,10 @@ mnasnet1_3 from torchvision.models.efficientnet import efficientnet_b0, efficientnet_b1, efficientnet_b2, \ efficientnet_b3, efficientnet_b4, efficientnet_b5, efficientnet_b6, efficientnet_b7 +from torchvision.models.regnet import regnet_y_400mf, regnet_y_800mf, \ + regnet_y_1_6gf, regnet_y_3_2gf, regnet_y_8gf, regnet_y_16gf, regnet_y_32gf, \ + regnet_x_400mf, regnet_x_800mf, regnet_x_1_6gf, regnet_x_3_2gf, regnet_x_8gf, \ + regnet_x_16gf, regnet_x_32gf # segmentation from torchvision.models.segmentation import fcn_resnet50, fcn_resnet101, \ diff --git a/references/classification/README.md b/references/classification/README.md index e293f53d0ea..79149758428 100644 --- a/references/classification/README.md +++ b/references/classification/README.md @@ -79,6 +79,13 @@ The weights of the B0-B4 variants are ported from Ross Wightman's [timm repo](ht The weights of the B5-B7 variants are ported from Luke Melas' [EfficientNet-PyTorch repo](https://github.com/lukemelas/EfficientNet-PyTorch/blob/1039e009545d9329ea026c9f7541341439712b96/efficientnet_pytorch/utils.py#L562-L564). 
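Since hubconf.py above now re-exports the new builders, the models should also be reachable through torch.hub once this lands (illustration only; shown without pretrained weights because the checkpoints are not published yet at this point in the series):
```
import torch

model = torch.hub.load("pytorch/vision", "regnet_y_400mf", pretrained=False)
```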
+ +### RegNet +``` +python -m torch.distributed.launch --nproc_per_node=8 --use_env train.py\ + --model regnet_y_400mf --epochs 100 +``` + ## Mixed precision training Automatic Mixed Precision (AMP) training on GPU for Pytorch can be enabled with the [NVIDIA Apex extension](https://github.com/NVIDIA/apex). diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py index 76d3381fe5b..98c60296015 100644 --- a/torchvision/models/regnet.py +++ b/torchvision/models/regnet.py @@ -1,3 +1,8 @@ +# Modified from +# https://github.com/facebookresearch/ClassyVision/blob/main/classy_vision/models/anynet.py +# https://github.com/facebookresearch/ClassyVision/blob/main/classy_vision/models/regnet.py + + import numpy as np import math import torch @@ -10,7 +15,6 @@ from .._internally_replaced_utils import load_state_dict_from_url from torchvision.models.mobilenetv2 import _make_divisible - model_urls = { # TODO(kazhang): add pretrained weights "regnet_y_400m": "", @@ -590,5 +594,192 @@ def _regnet(arch: str, params: RegNetParams, pretrained: bool, progress: bool, * def regnet_y_400mf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetY_400MF architecture from + `"Designing Network Design Spaces" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ params = RegNetParams(depth=16, w_0=48, w_a=27.89, w_m=2.09, group_width=8, **kwargs) return _regnet("regnet_y_400mf", params, pretrained, progress, **kwargs) + + +def regnet_y_800mf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetY_800MF architecture from + `"Designing Network Design Spaces" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = RegNetParams(depth=14, w_0=56, w_a=38.84, w_m=2.4, group_width=16, **kwargs) + return _regnet("regnet_y_800mf", params, pretrained, progress, **kwargs) + + +def regnet_y_1_6gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetY_1.6GF architecture from + `"Designing Network Design Spaces" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = RegNetParams(depth=27, w_0=48, w_a=20.71, w_m=2.65, group_width=24, **kwargs) + return _regnet("regnet_y_1_6gf", params, pretrained, progress, **kwargs) + + +def regnet_y_3_2gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetY_3.2GF architecture from + `"Designing Network Design Spaces" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = RegNetParams(depth=21, w_0=80, w_a=42.63, w_m=2.66, group_width=24, **kwargs) + return _regnet("regnet_y_3_2gf", params, pretrained, progress, **kwargs) + + +def regnet_y_8gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetY_8GF architecture from + `"Designing Network Design Spaces" `_. 
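The suffixes in the builder names (400MF, 800MF, 1.6GF, ...) refer to the approximate FLOP regime of each variant. A quick, illustrative way to see how the configurations scale (assuming a build of torchvision that includes this branch) is to count parameters:
```
import torch
from torchvision import models

for name in ["regnet_y_400mf", "regnet_y_800mf", "regnet_y_1_6gf"]:
    n_params = sum(p.numel() for p in getattr(models, name)().parameters())
    print(f"{name}: {n_params / 1e6:.1f}M parameters")
```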
+ + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = RegNetParams(depth=17, w_0=192, w_a=76.82, w_m=2.19, group_width=56, **kwargs) + return _regnet("regnet_y_8gf", params, pretrained, progress, **kwargs) + + +def regnet_y_16gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetY_16GF architecture from + `"Designing Network Design Spaces" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = RegNetParams(depth=18, w_0=200, w_a=106.23, w_m=2.48, group_width=112, **kwargs) + return _regnet("regnet_y_16gf", params, pretrained, progress, **kwargs) + + +def regnet_y_32gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetY_32GF architecture from + `"Designing Network Design Spaces" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = RegNetParams(depth=20, w_0=232, w_a=115.89, w_m=2.53, group_width=232, **kwargs) + return _regnet("regnet_y_32gf", params, pretrained, progress, **kwargs) + + +def regnet_x_400mf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetX_400MF architecture from + `"Designing Network Design Spaces" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = RegNetParams(depth=22, w_0=24, w_a=24.48, w_m=2.54, + group_width=16, use_se=False, **kwargs) + + return _regnet("regnet_x_400mf", params, pretrained, progress, **kwargs) + + +def regnet_x_800mf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetX_800MF architecture from + `"Designing Network Design Spaces" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = RegNetParams(depth=16, w_0=56, w_a=35.73, w_m=2.28, group_width=16, + use_se=False, **kwargs) + return _regnet("regnet_x_800mf", params, pretrained, progress, **kwargs) + + +def regnet_x_1_6gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetX_1.6GF architecture from + `"Designing Network Design Spaces" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = RegNetParams(depth=18, w_0=80, w_a=34.01, w_m=2.25, group_width=24, + use_se=False, **kwargs) + return _regnet("regnet_x_1_6gf", params, pretrained, progress, **kwargs) + + +def regnet_x_3_2gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetX_3.2GF architecture from + `"Designing Network Design Spaces" `_. 
+ + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = RegNetParams(depth=25, w_0=88, w_a=26.31, w_m=2.25, group_width=48, + use_se=False, **kwargs) + return _regnet("regnet_x_3_2gf", params, pretrained, progress, **kwargs) + + +def regnet_x_8gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetX_8GF architecture from + `"Designing Network Design Spaces" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = RegNetParams(depth=23, w_0=80, w_a=49.56, w_m=2.88, group_width=120, + use_se=False, **kwargs) + return _regnet("regnet_x_8gf", params, pretrained, progress, **kwargs) + + +def regnet_x_16gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetX_16GF architecture from + `"Designing Network Design Spaces" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = RegNetParams(depth=22, w_0=216, w_a=55.59, w_m=2.1, group_width=128, + use_se=False, **kwargs) + return _regnet("regnet_x_16gf", params, pretrained, progress, **kwargs) + + +def regnet_x_32gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetX_32GF architecture from + `"Designing Network Design Spaces" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = RegNetParams(depth=23, w_0=320, w_a=69.86, w_m=2.0, group_width=168, + use_se=False, **kwargs) + return _regnet("regnet_x_32gf", params, pretrained, progress, **kwargs) + +# TODO(kazhang): Add RegNetZ_500MF and RegNetZ_4GF From 59c5c7e65742d364004e0571816cf341c92e6262 Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Thu, 16 Sep 2021 23:25:42 +0000 Subject: [PATCH 11/40] remove unused depth --- torchvision/models/regnet.py | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py index 98c60296015..d405945267e 100644 --- a/torchvision/models/regnet.py +++ b/torchvision/models/regnet.py @@ -8,7 +8,6 @@ import torch from collections import OrderedDict -from enum import Enum, auto from typing import Any, Callable, List, Optional, Tuple from torch import nn, Tensor @@ -83,7 +82,6 @@ def __init__( ) self.final_bn = nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum) - self.depth = 2 class ResStemCifar(nn.Sequential): @@ -103,7 +101,6 @@ def __init__( nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum), activation, ) - self.depth = 2 class ResStemIN(nn.Sequential): @@ -124,7 +121,6 @@ def __init__( activation, nn.MaxPool2d(3, stride=2, padding=1), ) - self.depth = 3 class SimpleStemIN(nn.Sequential): @@ -144,7 +140,6 @@ def __init__( nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum), activation, ) - self.depth = 2 class VanillaBlock(nn.Sequential): @@ -174,8 +169,6 @@ def __init__( activation, ) - self.depth = 2 - class ResBasicBlock(nn.Module): """Residual basic block: x + F(x), F = basic transform.""" @@ -203,10 +196,6 @@ def __init__( ) self.activation = activation - # The projection and transform happen in parallel, - # and ReLU is not 
counted with respect to depth - self.depth = self.f.depth - def forward(self, x: Tensor) -> Tensor: if self.proj_block: x = self.bn(self.proj(x)) + self.f(x) @@ -260,7 +249,6 @@ def __init__( self.c = nn.Conv2d(w_b, width_out, 1, stride=1, padding=0, bias=False) self.final_bn = nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum) - self.depth = 3 if not se_ratio else 4 class ResBottleneckBlock(nn.Module): @@ -302,7 +290,6 @@ def __init__( # The projection and transform happen in parallel, # and activation is not counted with respect to depth - self.depth = self.f.depth def forward(self, x: Tensor) -> Tensor: if self.proj_block: @@ -341,8 +328,6 @@ def __init__( se_ratio, ) - self.depth = self.f.depth - def forward(self, x: Tensor) -> Tensor: return x + self.f(x) if self.has_skip else self.f(x) @@ -366,7 +351,6 @@ def __init__( stage_index: int = 0, ) -> None: super().__init__() - self.stage_depth = 0 for i in range(depth): block = block_constructor( @@ -381,7 +365,6 @@ def __init__( se_ratio, ) - self.stage_depth += block.depth self.add_module(f"block{stage_index}-{i}", block) @@ -516,8 +499,6 @@ def __init__(self, params: RegNetParams) -> None: current_width = params.stem_width - self.trunk_depth = 0 - blocks = [] for i, ( width_out, @@ -546,8 +527,6 @@ def __init__(self, params: RegNetParams) -> None: ) ) - self.trunk_depth += blocks[-1][1].stage_depth - current_width = width_out self.trunk_output = nn.Sequential(OrderedDict(blocks)) @@ -695,7 +674,6 @@ def regnet_x_400mf(pretrained: bool = False, progress: bool = True, **kwargs: An """ params = RegNetParams(depth=22, w_0=24, w_a=24.48, w_m=2.54, group_width=16, use_se=False, **kwargs) - return _regnet("regnet_x_400mf", params, pretrained, progress, **kwargs) From 33ad54e740d765d0927cee952e0096fc38a30877 Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Fri, 17 Sep 2021 07:10:50 +0000 Subject: [PATCH 12/40] use BN/activation constructor and ConvBNActivation --- torchvision/models/regnet.py | 187 +++++++++++++---------------------- 1 file changed, 66 insertions(+), 121 deletions(-) diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py index d405945267e..9c3330ec726 100644 --- a/torchvision/models/regnet.py +++ b/torchvision/models/regnet.py @@ -8,11 +8,12 @@ import torch from collections import OrderedDict +from functools import partial from typing import Any, Callable, List, Optional, Tuple from torch import nn, Tensor from .._internally_replaced_utils import load_state_dict_from_url -from torchvision.models.mobilenetv2 import _make_divisible +from torchvision.models.mobilenetv2 import ConvBNActivation, _make_divisible model_urls = { # TODO(kazhang): add pretrained weights @@ -68,39 +69,17 @@ def __init__( width_in: int, width_out: int, stride: int, - bn_epsilon: float, - bn_momentum: float, - activation: nn.Module, + norm_layer: Callable[..., nn.Module], + activation_layer: Callable[..., nn.Module], ) -> None: - super().__init__() - - self.a = nn.Sequential( - nn.Conv2d(width_in, width_out, 3, stride=stride, padding=1, bias=False), - nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum), - activation, - nn.Conv2d(width_out, width_out, 3, stride=1, padding=1, bias=False), - ) - - self.final_bn = nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum) - - -class ResStemCifar(nn.Sequential): - """ResNet stem for CIFAR: 3x3, BN, ReLU.""" - - def __init__( - self, - width_in: int, - width_out: int, - bn_epsilon: float, - bn_momentum: float, - activation: nn.Module, - ) -> None: - 
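The refactor in this patch leans on ConvBNActivation from the MobileNetV2 code, which bundles a bias-free Conv2d, the given norm layer and the given activation with "same"-style padding; e.g. the simple stem collapses to a single call (illustration only):
```
import torch
from torch import nn
from torchvision.models.mobilenetv2 import ConvBNActivation

stem = ConvBNActivation(3, 32, kernel_size=3, stride=2,
                        norm_layer=nn.BatchNorm2d, activation_layer=nn.ReLU)
print(stem(torch.randn(1, 3, 224, 224)).shape)   # torch.Size([1, 32, 112, 112])
```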
super().__init__() - self.stem = nn.Sequential( - nn.Conv2d(width_in, width_out, 3, stride=1, padding=1, bias=False), - nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum), - activation, - ) + super().__init__(OrderedDict( + a=nn.Sequential( + ConvBNActivation(width_in, width_out, kernel_size=3, stride=stride, + norm_layer=norm_layer, activation_layer=activation_layer), + nn.Conv2d(width_out, width_out, 3, stride=1, padding=1, bias=False), + ), + final_bn=norm_layer(width_out), + )) class ResStemIN(nn.Sequential): @@ -110,36 +89,28 @@ def __init__( self, width_in: int, width_out: int, - bn_epsilon: float, - bn_momentum: float, - activation: nn.Module, + norm_layer: Callable[..., nn.Module], + activation_layer: Callable[..., nn.Module], ) -> None: - super().__init__() - self.stem = nn.Sequential( - nn.Conv2d(width_in, width_out, 7, stride=2, padding=3, bias=False), - nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum), - activation, + super().__init__( + ConvBNActivation(width_in, width_out, kernel_size=7, stride=2, + norm_layer=norm_layer, activation_layer=activation_layer), nn.MaxPool2d(3, stride=2, padding=1), ) -class SimpleStemIN(nn.Sequential): +class SimpleStemIN(ConvBNActivation): """Simple stem for ImageNet: 3x3, BN, ReLU.""" def __init__( self, width_in: int, width_out: int, - bn_epsilon: float, - bn_momentum: float, - activation: nn.Module, + norm_layer: Callable[..., nn.Module], + activation_layer: Callable[..., nn.Module], ) -> None: - super().__init__() - self.stem = nn.Sequential( - nn.Conv2d(width_in, width_out, 3, stride=2, padding=1, bias=False), - nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum), - activation, - ) + super().__init__(width_in, width_out, kernel_size=3, stride=2, + norm_layer=norm_layer, activation_layer=activation_layer) class VanillaBlock(nn.Sequential): @@ -150,24 +121,17 @@ def __init__( width_in: int, width_out: int, stride: int, - bn_epsilon: float, - bn_momentum: float, - activation: nn.Module, + norm_layer: Callable[..., nn.Module], + activation_layer: Callable[..., nn.Module], *args, **kwargs, ) -> None: - super().__init__() - self.a = nn.Sequential( - nn.Conv2d(width_in, width_out, 3, stride=stride, padding=1, bias=False), - nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum), - activation, - ) - - self.b = nn.Sequential( - nn.Conv2d(width_out, width_out, 3, stride=1, padding=1, bias=False), - nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum), - activation, - ) + super().__init__(OrderedDict( + a=ConvBNActivation(width_in, width_out, kernel_size=3, stride=stride, + norm_layer=norm_layer, activation_layer=activation_layer), + b=ConvBNActivation(width_out, width_out, kernel_size=3, stride=1, + norm_layer=norm_layer, activation_layer=activation_layer), + )) class ResBasicBlock(nn.Module): @@ -178,9 +142,8 @@ def __init__( width_in: int, width_out: int, stride: int, - bn_epsilon: float, - bn_momentum: float, - activation: nn.Module, + norm_layer: Callable[..., nn.Module], + activation_layer: Callable[..., nn.Module], *args, **kwargs, ) -> None: @@ -190,11 +153,11 @@ def __init__( self.proj = nn.Conv2d( width_in, width_out, 1, stride=stride, padding=0, bias=False ) - self.bn = nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum) + self.bn = norm_layer(width_out) self.f = BasicTransform( - width_in, width_out, stride, bn_epsilon, bn_momentum, activation + width_in, width_out, stride, norm_layer, activation_layer ) - self.activation = activation + self.activation = 
activation_layer(inplace=True) def forward(self, x: Tensor) -> Tensor: if self.proj_block: @@ -213,42 +176,35 @@ def __init__( width_in: int, width_out: int, stride: int, - bn_epsilon: float, - bn_momentum: float, - activation: nn.Module, + norm_layer: Callable[..., nn.Module], + activation_layer: Callable[..., nn.Module], group_width: int, bottleneck_multiplier: float, se_ratio: Optional[float], ) -> None: - super().__init__() + layers = OrderedDict() w_b = int(round(width_out * bottleneck_multiplier)) g = w_b // group_width - self.a = nn.Sequential( - nn.Conv2d(width_in, w_b, 1, stride=1, padding=0, bias=False), - nn.BatchNorm2d(w_b, eps=bn_epsilon, momentum=bn_momentum), - activation, - ) - - self.b = nn.Sequential( - nn.Conv2d(w_b, w_b, 3, stride=stride, padding=1, groups=g, bias=False), - nn.BatchNorm2d(w_b, eps=bn_epsilon, momentum=bn_momentum), - activation, - ) + layers["a"] = ConvBNActivation(width_in, w_b, kernel_size=1, stride=1, + norm_layer=norm_layer, activation_layer=activation_layer) + layers["b"] = ConvBNActivation(w_b, w_b, kernel_size=3, stride=stride, groups=g, + norm_layer=norm_layer, activation_layer=activation_layer) if se_ratio: # The SE reduction ratio is defined with respect to the # beginning of the block width_se_out = int(round(se_ratio * width_in)) - self.se = _SqueezeExcitation( + layers["se"] = _SqueezeExcitation( in_channels=w_b, reduction_ratio=None, reduced_channels=width_se_out, - activation=activation, + activation=activation_layer(inplace=True), ) - self.c = nn.Conv2d(w_b, width_out, 1, stride=1, padding=0, bias=False) - self.final_bn = nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum) + layers["c"] = nn.Conv2d(w_b, width_out, 1, stride=1, padding=0, bias=False) + layers["final_bn"] = norm_layer(width_out) + super().__init__(layers) class ResBottleneckBlock(nn.Module): @@ -259,9 +215,8 @@ def __init__( width_in: int, width_out: int, stride: int, - bn_epsilon: float, - bn_momentum: float, - activation: nn.Module, + norm_layer: Callable[..., nn.Module], + activation_layer: Callable[..., nn.Module], group_width: int = 1, bottleneck_multiplier: float = 1.0, se_ratio: Optional[float] = None, @@ -274,19 +229,18 @@ def __init__( self.proj = nn.Conv2d( width_in, width_out, 1, stride=stride, padding=0, bias=False ) - self.bn = nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum) + self.bn = norm_layer(width_out) self.f = BottleneckTransform( width_in, width_out, stride, - bn_epsilon, - bn_momentum, - activation, + norm_layer, + activation_layer, group_width, bottleneck_multiplier, se_ratio, ) - self.activation = activation + self.activation = activation_layer(inplace=True) # The projection and transform happen in parallel, # and activation is not counted with respect to depth @@ -307,9 +261,8 @@ def __init__( width_in: int, width_out: int, stride: int, - bn_epsilon: float, - bn_momentum: float, - activation: nn.Module, + norm_layer: Callable[..., nn.Module], + activation_layer: Callable[..., nn.Module], group_width: int = 1, bottleneck_multiplier: float = 4.0, se_ratio: Optional[float] = None, @@ -320,9 +273,8 @@ def __init__( width_in, width_out, stride, - bn_epsilon, - bn_momentum, - activation, + norm_layer, + activation_layer, group_width, bottleneck_multiplier, se_ratio, @@ -342,9 +294,8 @@ def __init__( stride: int, depth: int, block_constructor: Callable[..., nn.Module], - bn_epsilon: float, - bn_momentum: float, - activation: nn.Module, + norm_layer: Callable[..., nn.Module], + activation_layer: Callable[..., nn.Module], 
group_width: int, bottleneck_multiplier: float, se_ratio: Optional[float] = None, @@ -357,9 +308,8 @@ def __init__( width_in if i == 0 else width_out, width_out, stride if i == 0 else 1, - bn_epsilon, - bn_momentum, - activation, + norm_layer, + activation_layer, group_width, bottleneck_multiplier, se_ratio, @@ -398,11 +348,10 @@ def __init__( self.stem_type = stem_type self.block_type = block_type self.activation = activation + self.norm_layer = partial(nn.BatchNorm2d, eps=bn_epsilon, momentum=bn_momentum) self.stem_width = stem_width self.use_se = use_se self.se_ratio = se_ratio if use_se else None - self.bn_epsilon = bn_epsilon - self.bn_momentum = bn_momentum self.num_classes = num_classes def get_expanded_params(self): @@ -486,15 +435,12 @@ class RegNet(nn.Module): def __init__(self, params: RegNetParams) -> None: super().__init__() - activation = params.activation(inplace=True) - # Ad hoc stem self.stem = params.stem_type( 3, # width_in params.stem_width, - params.bn_epsilon, - params.bn_momentum, - activation, + params.norm_layer, + params.activation, ) current_width = params.stem_width @@ -516,9 +462,8 @@ def __init__(self, params: RegNetParams) -> None: stride, depth, params.block_type, - params.bn_epsilon, - params.bn_momentum, - activation, + params.norm_layer, + params.activation, group_width, bottleneck_multiplier, params.se_ratio, From 346aba7007a463cbe9a958472be68f7ace35c76c Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Fri, 17 Sep 2021 07:18:51 +0000 Subject: [PATCH 13/40] add expected test pkl files --- .../ModelTester.test_regnet_x_16gf_expect.pkl | Bin 0 -> 939 bytes .../ModelTester.test_regnet_x_1_6gf_expect.pkl | Bin 0 -> 939 bytes .../ModelTester.test_regnet_x_32gf_expect.pkl | Bin 0 -> 939 bytes .../ModelTester.test_regnet_x_3_2gf_expect.pkl | Bin 0 -> 939 bytes .../ModelTester.test_regnet_x_400mf_expect.pkl | Bin 0 -> 939 bytes .../ModelTester.test_regnet_x_800mf_expect.pkl | Bin 0 -> 939 bytes .../ModelTester.test_regnet_x_8gf_expect.pkl | Bin 0 -> 939 bytes .../ModelTester.test_regnet_y_16gf_expect.pkl | Bin 0 -> 939 bytes .../ModelTester.test_regnet_y_1_6gf_expect.pkl | Bin 0 -> 939 bytes .../ModelTester.test_regnet_y_32gf_expect.pkl | Bin 0 -> 939 bytes .../ModelTester.test_regnet_y_3_2gf_expect.pkl | Bin 0 -> 939 bytes .../ModelTester.test_regnet_y_400mf_expect.pkl | Bin 0 -> 939 bytes .../ModelTester.test_regnet_y_800mf_expect.pkl | Bin 0 -> 939 bytes .../ModelTester.test_regnet_y_8gf_expect.pkl | Bin 0 -> 939 bytes 14 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 test/expect/ModelTester.test_regnet_x_16gf_expect.pkl create mode 100644 test/expect/ModelTester.test_regnet_x_1_6gf_expect.pkl create mode 100644 test/expect/ModelTester.test_regnet_x_32gf_expect.pkl create mode 100644 test/expect/ModelTester.test_regnet_x_3_2gf_expect.pkl create mode 100644 test/expect/ModelTester.test_regnet_x_400mf_expect.pkl create mode 100644 test/expect/ModelTester.test_regnet_x_800mf_expect.pkl create mode 100644 test/expect/ModelTester.test_regnet_x_8gf_expect.pkl create mode 100644 test/expect/ModelTester.test_regnet_y_16gf_expect.pkl create mode 100644 test/expect/ModelTester.test_regnet_y_1_6gf_expect.pkl create mode 100644 test/expect/ModelTester.test_regnet_y_32gf_expect.pkl create mode 100644 test/expect/ModelTester.test_regnet_y_3_2gf_expect.pkl create mode 100644 test/expect/ModelTester.test_regnet_y_400mf_expect.pkl create mode 100644 test/expect/ModelTester.test_regnet_y_800mf_expect.pkl create mode 100644 
test/expect/ModelTester.test_regnet_y_8gf_expect.pkl
[GIT binary patches: 939-byte literal payloads for the new test/expect/ModelTester.test_regnet_*_expect.pkl files listed above]
zE6qkQV6Dx(jaoKh_jC5VEttGpLnePuO>%{e;x?UKJNH!X5%=b|Q5Idl+wiA{O&W{$ zt`C{=dva!Mw$f){-m_n#%SNw$?(R>G{nj~#x2;_zC)rf1+U=Qf>y4F(VB&6uqa1r4 zt9n^&vy|Ou> literal 0 HcmV?d00001 diff --git a/test/expect/ModelTester.test_regnet_y_800mf_expect.pkl b/test/expect/ModelTester.test_regnet_y_800mf_expect.pkl new file mode 100644 index 0000000000000000000000000000000000000000..a27eefdfd299fc38dad0b600cdb2d35effa3d9e8 GIT binary patch literal 939 zcmWIWW@cev;NW1u00Im`42ea_8JT6N`YDMeiFyUuIc`pT3{fbcfhoBpAE-(%zO*DW zr zf)S|3ppZF&8AvA=loqmh8ZvUemW=jY_4CYNO9=M{7L z7p0^YrKY%KCYNv(a%ct>a+VZw1r>7Z1$eV_Fj*X^nFTZrgadH;l#f9R#i#lPZcb`w z{zUOK5@-3x&fUJUWj51hGh5yH|7W+pg!Qg7sm{BF_FUh+-&=0C>#VljLLC}5t3-u& zMXLYVDZM{=Px^{UR%YB5yXNxvSfA6lwCk{(=k67!r|v$P%DHDf8;8yAzXiMB%1Ya0 zzA)U~66b5V-86i+k$|MNkxucRZyKw2Ep6@F9n{>gQ*^~FYeRj<-N%+I*@T5Jw0gDm zx7B6)p50H^dbb;FmU`T;5?$F|Au!e_LS!z)+Fc#dL%!v#xq>zI!jk!QJU!0d7 z$^^6(gaf=8K@>bqBFCWsNCE|*r%-g=$bRBO(fJC4k zf)S|3ppZF&8AvA=loqmh8ZvUemW=jY_4CYNO9=M{7L z7p0^YrKY%KCYNv(a%ct>a+VZw1r>7Z1$eV_Fj*X^nFTZrgadH;l#f9R#i#lPZcb`w z{zUOK5~sfBmi1;o<2}FDKDL>$%FgESR)O8Y-flLLHJ7YsWPGxTThXv5F7A|d{EIm@ z*ACm*#4ozJNBq~~J-nsHHd2YPHg6vDSzp(X+LJMleNUrM*`C{ndaavIM(ok&yl-Rr zx5GwfO`{Fx%g#L>(;KZfH+9;ae#vc}^$C@O@8`$Q+xc5&U}= zFKOOWo+@So3a!~{Zw*cWLkfg(hZaABH9WM+Qj3a#vEb%pPGoQ)g&c%w%muRf;=J@w zCZMe#9N^6eqTp!~ISvIt5-0#Yg`(?5_7fk9&R0MlvTl6?biK%~;z!Xd0dyfuFEk7S zyxG`vpo-*}b>YfEi5Uc7^mYiBVG`JrAn&q)@& Date: Sun, 19 Sep 2021 22:59:51 +0000 Subject: [PATCH 14/40] allow custom activation in SqueezeExcitation --- torchvision/models/efficientnet.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/torchvision/models/efficientnet.py b/torchvision/models/efficientnet.py index bad5b57b25b..a9f8ac5a7c0 100644 --- a/torchvision/models/efficientnet.py +++ b/torchvision/models/efficientnet.py @@ -36,14 +36,14 @@ def __init__( self, input_channels: int, squeeze_channels: int, - activation: Callable[..., nn.Module] = nn.ReLU, - scale_activation: Callable[..., nn.Module] = nn.Sigmoid, + activation: Optional[Callable[..., nn.Module]] = None, ) -> None: super().__init__() self.fc1 = nn.Conv2d(input_channels, squeeze_channels, 1) self.fc2 = nn.Conv2d(squeeze_channels, input_channels, 1) - self.activation = activation() - self.scale_activation = scale_activation() + if activation is None: + activation = nn.SiLU + self.activation = activation(inplace=True) def _scale(self, input: Tensor) -> Tensor: scale = F.adaptive_avg_pool2d(input, 1) From e4863079d9ca542e585bcc16eebfa34bb8542f47 Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Mon, 20 Sep 2021 20:04:51 +0000 Subject: [PATCH 15/40] use ReLU as the default activation --- torchvision/models/efficientnet.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/torchvision/models/efficientnet.py b/torchvision/models/efficientnet.py index a9f8ac5a7c0..dbfb6bb7dd7 100644 --- a/torchvision/models/efficientnet.py +++ b/torchvision/models/efficientnet.py @@ -36,14 +36,12 @@ def __init__( self, input_channels: int, squeeze_channels: int, - activation: Optional[Callable[..., nn.Module]] = None, + activation: Callable[..., nn.Module] = nn.ReLU, ) -> None: super().__init__() self.fc1 = nn.Conv2d(input_channels, squeeze_channels, 1) self.fc2 = nn.Conv2d(squeeze_channels, input_channels, 1) - if activation is None: - activation = nn.SiLU - self.activation = activation(inplace=True) + self.activation = activation() def _scale(self, input: Tensor) -> Tensor: scale = F.adaptive_avg_pool2d(input, 1) From 8cab2bbd70484319a50a631f108dc6fd410c54a8 Mon Sep 17 00:00:00 2001 
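Patches 14 and 15 above settle the SqueezeExcitation constructor in torchvision/models/efficientnet.py on (input_channels, squeeze_channels, activation=nn.ReLU), with the activation passed as a class and instantiated inside the block. A minimal usage sketch of that interface as it stands at this point in the series (the forward pass is not shown in the diff; the usual squeeze-and-excite behaviour of rescaling the input channel-wise is assumed, and the channel sizes below are arbitrary example values):

    import torch
    from torch import nn
    from torchvision.models.efficientnet import SqueezeExcitation

    # 64-channel feature map squeezed down to 16 channels inside the block;
    # the inner activation is configurable and defaults to nn.ReLU.
    se = SqueezeExcitation(input_channels=64, squeeze_channels=16, activation=nn.ReLU)
    x = torch.randn(2, 64, 7, 7)
    out = se(x)  # channel-wise rescaling, so the output shape should remain (2, 64, 7, 7)
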
From: Kai Zhang Date: Thu, 9 Sep 2021 00:15:55 +0000 Subject: [PATCH 16/40] initial code --- torchvision/models/regnet.py | 707 +---------------------------------- 1 file changed, 6 insertions(+), 701 deletions(-) diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py index 9c3330ec726..96493ae3c4c 100644 --- a/torchvision/models/regnet.py +++ b/torchvision/models/regnet.py @@ -1,708 +1,13 @@ -# Modified from -# https://github.com/facebookresearch/ClassyVision/blob/main/classy_vision/models/anynet.py -# https://github.com/facebookresearch/ClassyVision/blob/main/classy_vision/models/regnet.py - - -import numpy as np -import math -import torch - -from collections import OrderedDict -from functools import partial -from typing import Any, Callable, List, Optional, Tuple -from torch import nn, Tensor - -from .._internally_replaced_utils import load_state_dict_from_url -from torchvision.models.mobilenetv2 import ConvBNActivation, _make_divisible - -model_urls = { - # TODO(kazhang): add pretrained weights - "regnet_y_400m": "", -} - - -class _SqueezeExcitation(nn.Module): - """ - Squeeze and excitation layer from - `"Squeeze-and-Excitation Networks" `_. - """ - - def __init__( - self, - in_channels: int, - reduction_ratio: Optional[int] = 16, - reduced_channels: Optional[int] = None, - activation: Optional[nn.Module] = None, - ) -> None: - super().__init__() - self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) - - # Either reduction_ratio is defined, or out_channels is defined, - # neither both nor none of them - assert bool(reduction_ratio) != bool(reduced_channels) - - if activation is None: - activation = nn.ReLU() - - reduced_channels = ( - in_channels // reduction_ratio if reduced_channels is None else reduced_channels - ) - self.excitation = nn.Sequential( - nn.Conv2d(in_channels, reduced_channels, kernel_size=1, stride=1, bias=True), - activation, - nn.Conv2d(reduced_channels, in_channels, kernel_size=1, stride=1, bias=True), - nn.Sigmoid(), - ) - - def forward(self, x: Tensor) -> Tensor: - x_squeezed = self.avgpool(x) - x_excited = self.excitation(x_squeezed) - x_scaled = x * x_excited - return x_scaled - - -class BasicTransform(nn.Sequential): - """Basic transformation: [3x3 conv, BN, Relu] x2.""" - - def __init__( - self, - width_in: int, - width_out: int, - stride: int, - norm_layer: Callable[..., nn.Module], - activation_layer: Callable[..., nn.Module], - ) -> None: - super().__init__(OrderedDict( - a=nn.Sequential( - ConvBNActivation(width_in, width_out, kernel_size=3, stride=stride, - norm_layer=norm_layer, activation_layer=activation_layer), - nn.Conv2d(width_out, width_out, 3, stride=1, padding=1, bias=False), - ), - final_bn=norm_layer(width_out), - )) - - -class ResStemIN(nn.Sequential): - """ResNet stem for ImageNet: 7x7, BN, ReLU, MaxPool.""" - - def __init__( - self, - width_in: int, - width_out: int, - norm_layer: Callable[..., nn.Module], - activation_layer: Callable[..., nn.Module], - ) -> None: - super().__init__( - ConvBNActivation(width_in, width_out, kernel_size=7, stride=2, - norm_layer=norm_layer, activation_layer=activation_layer), - nn.MaxPool2d(3, stride=2, padding=1), - ) - - -class SimpleStemIN(ConvBNActivation): - """Simple stem for ImageNet: 3x3, BN, ReLU.""" - - def __init__( - self, - width_in: int, - width_out: int, - norm_layer: Callable[..., nn.Module], - activation_layer: Callable[..., nn.Module], - ) -> None: - super().__init__(width_in, width_out, kernel_size=3, stride=2, - norm_layer=norm_layer, 
activation_layer=activation_layer) - - -class VanillaBlock(nn.Sequential): - """Vanilla block: [3x3 conv, BN, Relu] x2.""" - - def __init__( - self, - width_in: int, - width_out: int, - stride: int, - norm_layer: Callable[..., nn.Module], - activation_layer: Callable[..., nn.Module], - *args, - **kwargs, - ) -> None: - super().__init__(OrderedDict( - a=ConvBNActivation(width_in, width_out, kernel_size=3, stride=stride, - norm_layer=norm_layer, activation_layer=activation_layer), - b=ConvBNActivation(width_out, width_out, kernel_size=3, stride=1, - norm_layer=norm_layer, activation_layer=activation_layer), - )) - - -class ResBasicBlock(nn.Module): - """Residual basic block: x + F(x), F = basic transform.""" - - def __init__( - self, - width_in: int, - width_out: int, - stride: int, - norm_layer: Callable[..., nn.Module], - activation_layer: Callable[..., nn.Module], - *args, - **kwargs, - ) -> None: - super().__init__() - self.proj_block = (width_in != width_out) or (stride != 1) - if self.proj_block: - self.proj = nn.Conv2d( - width_in, width_out, 1, stride=stride, padding=0, bias=False - ) - self.bn = norm_layer(width_out) - self.f = BasicTransform( - width_in, width_out, stride, norm_layer, activation_layer - ) - self.activation = activation_layer(inplace=True) - - def forward(self, x: Tensor) -> Tensor: - if self.proj_block: - x = self.bn(self.proj(x)) + self.f(x) - else: - x = x + self.f(x) - - return self.activation(x) - - -class BottleneckTransform(nn.Sequential): - """Bottleneck transformation: 1x1, 3x3 [+SE], 1x1.""" - - def __init__( - self, - width_in: int, - width_out: int, - stride: int, - norm_layer: Callable[..., nn.Module], - activation_layer: Callable[..., nn.Module], - group_width: int, - bottleneck_multiplier: float, - se_ratio: Optional[float], - ) -> None: - layers = OrderedDict() - w_b = int(round(width_out * bottleneck_multiplier)) - g = w_b // group_width - - layers["a"] = ConvBNActivation(width_in, w_b, kernel_size=1, stride=1, - norm_layer=norm_layer, activation_layer=activation_layer) - layers["b"] = ConvBNActivation(w_b, w_b, kernel_size=3, stride=stride, groups=g, - norm_layer=norm_layer, activation_layer=activation_layer) - - if se_ratio: - # The SE reduction ratio is defined with respect to the - # beginning of the block - width_se_out = int(round(se_ratio * width_in)) - layers["se"] = _SqueezeExcitation( - in_channels=w_b, - reduction_ratio=None, - reduced_channels=width_se_out, - activation=activation_layer(inplace=True), - ) - - layers["c"] = nn.Conv2d(w_b, width_out, 1, stride=1, padding=0, bias=False) - layers["final_bn"] = norm_layer(width_out) - super().__init__(layers) - - -class ResBottleneckBlock(nn.Module): - """Residual bottleneck block: x + F(x), F = bottleneck transform.""" - - def __init__( - self, - width_in: int, - width_out: int, - stride: int, - norm_layer: Callable[..., nn.Module], - activation_layer: Callable[..., nn.Module], - group_width: int = 1, - bottleneck_multiplier: float = 1.0, - se_ratio: Optional[float] = None, - ) -> None: - super().__init__() - - # Use skip connection with projection if shape changes - self.proj_block = (width_in != width_out) or (stride != 1) - if self.proj_block: - self.proj = nn.Conv2d( - width_in, width_out, 1, stride=stride, padding=0, bias=False - ) - self.bn = norm_layer(width_out) - self.f = BottleneckTransform( - width_in, - width_out, - stride, - norm_layer, - activation_layer, - group_width, - bottleneck_multiplier, - se_ratio, - ) - self.activation = activation_layer(inplace=True) - - # The 
projection and transform happen in parallel, - # and activation is not counted with respect to depth - - def forward(self, x: Tensor) -> Tensor: - if self.proj_block: - x = self.bn(self.proj(x)) + self.f(x) - else: - x = x + self.f(x) - return self.activation(x) - - -class ResBottleneckLinearBlock(nn.Module): - """Residual linear bottleneck block: x + F(x), F = bottleneck transform.""" - - def __init__( - self, - width_in: int, - width_out: int, - stride: int, - norm_layer: Callable[..., nn.Module], - activation_layer: Callable[..., nn.Module], - group_width: int = 1, - bottleneck_multiplier: float = 4.0, - se_ratio: Optional[float] = None, - ) -> None: - super().__init__() - self.has_skip = (width_in == width_out) and (stride == 1) - self.f = BottleneckTransform( - width_in, - width_out, - stride, - norm_layer, - activation_layer, - group_width, - bottleneck_multiplier, - se_ratio, - ) - - def forward(self, x: Tensor) -> Tensor: - return x + self.f(x) if self.has_skip else self.f(x) - - -class AnyStage(nn.Sequential): - """AnyNet stage (sequence of blocks w/ the same output shape).""" - - def __init__( - self, - width_in: int, - width_out: int, - stride: int, - depth: int, - block_constructor: Callable[..., nn.Module], - norm_layer: Callable[..., nn.Module], - activation_layer: Callable[..., nn.Module], - group_width: int, - bottleneck_multiplier: float, - se_ratio: Optional[float] = None, - stage_index: int = 0, - ) -> None: - super().__init__() - - for i in range(depth): - block = block_constructor( - width_in if i == 0 else width_out, - width_out, - stride if i == 0 else 1, - norm_layer, - activation_layer, - group_width, - bottleneck_multiplier, - se_ratio, - ) - - self.add_module(f"block{stage_index}-{i}", block) - +from torch import nn class RegNetParams: - def __init__( - self, - depth: int, - w_0: int, - w_a: float, - w_m: float, - group_width: int, - bottleneck_multiplier: float = 1.0, - stem_type: Callable[..., nn.Module] = SimpleStemIN, - stem_width: int = 32, - block_type: Callable[..., nn.Module] = ResBottleneckBlock, - activation: Callable[..., nn.Module] = nn.ReLU, - use_se: bool = True, - se_ratio: float = 0.25, - bn_epsilon: float = 1e-05, - bn_momentum: float = 0.1, - num_classes: int = 1000, - ) -> None: - if w_a < 0 or w_0 <= 0 or w_m <= 1 or w_0 % 8 != 0: - raise ValueError("Invalid RegNet settings") - self.depth = depth - self.w_0 = w_0 - self.w_a = w_a - self.w_m = w_m - self.group_width = group_width - self.bottleneck_multiplier = bottleneck_multiplier - self.stem_type = stem_type - self.block_type = block_type - self.activation = activation - self.norm_layer = partial(nn.BatchNorm2d, eps=bn_epsilon, momentum=bn_momentum) - self.stem_width = stem_width - self.use_se = use_se - self.se_ratio = se_ratio if use_se else None - self.num_classes = num_classes - - def get_expanded_params(self): - """ - Programatically compute all the per-block settings, - given the RegNet parameters. - - The first step is to compute the quantized linear block parameters, - in log space. Key parameters are: - - `w_a` is the width progression slope - - `w_0` is the initial width - - `w_m` is the width stepping in the log space - - In other terms - `log(block_width) = log(w_0) + w_m * block_capacity`, - with `bock_capacity` ramping up following the w_0 and w_a params. - This block width is finally quantized to multiples of 8. - - The second step is to compute the parameters per stage, - taking into account the skip connection and the final 1x1 convolutions. 
- We use the fact that the output width is constant within a stage. - """ - - QUANT = 8 - STRIDE = 2 - - # Compute the block widths. Each stage has one unique block width - widths_cont = np.arange(self.depth) * self.w_a + self.w_0 - block_capacity = np.round(np.log(widths_cont / self.w_0) / np.log(self.w_m)) - block_widths = ( - np.round(np.divide(self.w_0 * np.power(self.w_m, block_capacity), QUANT)) - * QUANT - ) - num_stages = len(np.unique(block_widths)) - block_widths = block_widths.astype(int).tolist() - - # Convert to per stage parameters - split_helper = zip( - block_widths + [0], - [0] + block_widths, - block_widths + [0], - [0] + block_widths, - ) - splits = [w != wp or r != rp for w, wp, r, rp in split_helper] - - stage_widths = [w for w, t in zip(block_widths, splits[:-1]) if t] - stage_depths = np.diff([d for d, t in enumerate(splits) if t]).tolist() - - strides = [STRIDE] * num_stages - bottleneck_multipliers = [self.bottleneck_multiplier] * num_stages - group_widths = [self.group_width] * num_stages - - # Adjust the compatibility of stage widths and group widths - stage_widths, group_widths = self._adjust_widths_groups_compatibilty( - stage_widths, bottleneck_multipliers, group_widths - ) - - return zip( - stage_widths, strides, stage_depths, group_widths, bottleneck_multipliers - ) - - @staticmethod - def _adjust_widths_groups_compatibilty( - stage_widths: List[int], bottleneck_ratios: List[float], - group_widths: List[int]) -> Tuple[List[int], List[int]]: - """ - Adjusts the compatibility of widths and groups, - depending on the bottleneck ratio. - """ - # Compute all widths for the current settings - widths = [int(w * b) for w, b in zip(stage_widths, bottleneck_ratios)] - group_widths_min = [min(g, w_bot) for g, w_bot in zip(group_widths, widths)] - - # Compute the adjusted widths so that stage and group widths fit - ws_bot = [_make_divisible(w_bot, g) for w_bot, g in zip(widths, group_widths_min)] - stage_widths = [int(w_bot / b) for w_bot, b in zip(ws_bot, bottleneck_ratios)] - return stage_widths, group_widths_min + pass +class SqueezeExcitation(nn.Module): + pass class RegNet(nn.Module): - def __init__(self, params: RegNetParams) -> None: - super().__init__() - - # Ad hoc stem - self.stem = params.stem_type( - 3, # width_in - params.stem_width, - params.norm_layer, - params.activation, - ) - - current_width = params.stem_width - - blocks = [] - for i, ( - width_out, - stride, - depth, - group_width, - bottleneck_multiplier, - ) in enumerate(params.get_expanded_params()): - blocks.append( - ( - f"block{i+1}", - AnyStage( - current_width, - width_out, - stride, - depth, - params.block_type, - params.norm_layer, - params.activation, - group_width, - bottleneck_multiplier, - params.se_ratio, - stage_index=i + 1, - ), - ) - ) - - current_width = width_out - - self.trunk_output = nn.Sequential(OrderedDict(blocks)) - - self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) - self.fc = nn.Linear(in_features=current_width, out_features=params.num_classes) - - # Init weights and good to go - self._init_weights() - - def forward(self, x: Tensor) -> Tensor: - x = self.stem(x) - x = self.trunk_output(x) - - x = self.avgpool(x) - x = x.flatten(start_dim=1) - x = self.fc(x) - - return x - - def _init_weights(self) -> None: - # Performs ResNet-style weight initialization - for m in self.modules(): - if isinstance(m, nn.Conv2d): - # Note that there is no bias due to BN - fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels - m.weight.data.normal_(mean=0.0, std=math.sqrt(2.0 / fan_out)) 
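# The conv initialization in the two lines above (std = sqrt(2 / fan_out), with
# fan_out = kernel_h * kernel_w * out_channels) is the ResNet-style "Kaiming"
# fan-out scheme. A rough standalone equivalent, with arbitrary example channel
# sizes that are not taken from the patch, would be:
#
#     from torch import nn
#     conv = nn.Conv2d(32, 64, kernel_size=3, padding=1, bias=False)
#     nn.init.kaiming_normal_(conv.weight, mode="fan_out", nonlinearity="relu")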
- elif isinstance(m, nn.BatchNorm2d): - m.weight.data.fill_(1.0) - m.bias.data.zero_() - elif isinstance(m, nn.Linear): - m.weight.data.normal_(mean=0.0, std=0.01) - m.bias.data.zero_() - - -def _regnet(arch: str, params: RegNetParams, pretrained: bool, progress: bool, **kwargs: Any) -> RegNet: - model = RegNet(params) - if pretrained: - if arch not in model_urls: - raise ValueError(f"No checkpoint is available for model type {arch}") - state_dict = load_state_dict_from_url(model_urls[arch], progress=progress) - model.load_state_dict(state_dict) - return model - + pass def regnet_y_400mf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: - """ - Constructs a RegNetY_400MF architecture from - `"Designing Network Design Spaces" `_. - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - progress (bool): If True, displays a progress bar of the download to stderr - """ - params = RegNetParams(depth=16, w_0=48, w_a=27.89, w_m=2.09, group_width=8, **kwargs) - return _regnet("regnet_y_400mf", params, pretrained, progress, **kwargs) - - -def regnet_y_800mf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: - """ - Constructs a RegNetY_800MF architecture from - `"Designing Network Design Spaces" `_. - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - progress (bool): If True, displays a progress bar of the download to stderr - """ - params = RegNetParams(depth=14, w_0=56, w_a=38.84, w_m=2.4, group_width=16, **kwargs) - return _regnet("regnet_y_800mf", params, pretrained, progress, **kwargs) - - -def regnet_y_1_6gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: - """ - Constructs a RegNetY_1.6GF architecture from - `"Designing Network Design Spaces" `_. - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - progress (bool): If True, displays a progress bar of the download to stderr - """ - params = RegNetParams(depth=27, w_0=48, w_a=20.71, w_m=2.65, group_width=24, **kwargs) - return _regnet("regnet_y_1_6gf", params, pretrained, progress, **kwargs) - - -def regnet_y_3_2gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: - """ - Constructs a RegNetY_3.2GF architecture from - `"Designing Network Design Spaces" `_. - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - progress (bool): If True, displays a progress bar of the download to stderr - """ - params = RegNetParams(depth=21, w_0=80, w_a=42.63, w_m=2.66, group_width=24, **kwargs) - return _regnet("regnet_y_3_2gf", params, pretrained, progress, **kwargs) - - -def regnet_y_8gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: - """ - Constructs a RegNetY_8GF architecture from - `"Designing Network Design Spaces" `_. - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - progress (bool): If True, displays a progress bar of the download to stderr - """ - params = RegNetParams(depth=17, w_0=192, w_a=76.82, w_m=2.19, group_width=56, **kwargs) - return _regnet("regnet_y_8gf", params, pretrained, progress, **kwargs) - - -def regnet_y_16gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: - """ - Constructs a RegNetY_16GF architecture from - `"Designing Network Design Spaces" `_. 
- - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - progress (bool): If True, displays a progress bar of the download to stderr - """ - params = RegNetParams(depth=18, w_0=200, w_a=106.23, w_m=2.48, group_width=112, **kwargs) - return _regnet("regnet_y_16gf", params, pretrained, progress, **kwargs) - - -def regnet_y_32gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: - """ - Constructs a RegNetY_32GF architecture from - `"Designing Network Design Spaces" `_. - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - progress (bool): If True, displays a progress bar of the download to stderr - """ - params = RegNetParams(depth=20, w_0=232, w_a=115.89, w_m=2.53, group_width=232, **kwargs) - return _regnet("regnet_y_32gf", params, pretrained, progress, **kwargs) - - -def regnet_x_400mf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: - """ - Constructs a RegNetX_400MF architecture from - `"Designing Network Design Spaces" `_. - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - progress (bool): If True, displays a progress bar of the download to stderr - """ - params = RegNetParams(depth=22, w_0=24, w_a=24.48, w_m=2.54, - group_width=16, use_se=False, **kwargs) - return _regnet("regnet_x_400mf", params, pretrained, progress, **kwargs) - - -def regnet_x_800mf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: - """ - Constructs a RegNetX_800MF architecture from - `"Designing Network Design Spaces" `_. - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - progress (bool): If True, displays a progress bar of the download to stderr - """ - params = RegNetParams(depth=16, w_0=56, w_a=35.73, w_m=2.28, group_width=16, - use_se=False, **kwargs) - return _regnet("regnet_x_800mf", params, pretrained, progress, **kwargs) - - -def regnet_x_1_6gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: - """ - Constructs a RegNetX_1.6GF architecture from - `"Designing Network Design Spaces" `_. - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - progress (bool): If True, displays a progress bar of the download to stderr - """ - params = RegNetParams(depth=18, w_0=80, w_a=34.01, w_m=2.25, group_width=24, - use_se=False, **kwargs) - return _regnet("regnet_x_1_6gf", params, pretrained, progress, **kwargs) - - -def regnet_x_3_2gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: - """ - Constructs a RegNetX_3.2GF architecture from - `"Designing Network Design Spaces" `_. - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - progress (bool): If True, displays a progress bar of the download to stderr - """ - params = RegNetParams(depth=25, w_0=88, w_a=26.31, w_m=2.25, group_width=48, - use_se=False, **kwargs) - return _regnet("regnet_x_3_2gf", params, pretrained, progress, **kwargs) - - -def regnet_x_8gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: - """ - Constructs a RegNetX_8GF architecture from - `"Designing Network Design Spaces" `_. 
- - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - progress (bool): If True, displays a progress bar of the download to stderr - """ - params = RegNetParams(depth=23, w_0=80, w_a=49.56, w_m=2.88, group_width=120, - use_se=False, **kwargs) - return _regnet("regnet_x_8gf", params, pretrained, progress, **kwargs) - - -def regnet_x_16gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: - """ - Constructs a RegNetX_16GF architecture from - `"Designing Network Design Spaces" `_. - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - progress (bool): If True, displays a progress bar of the download to stderr - """ - params = RegNetParams(depth=22, w_0=216, w_a=55.59, w_m=2.1, group_width=128, - use_se=False, **kwargs) - return _regnet("regnet_x_16gf", params, pretrained, progress, **kwargs) - - -def regnet_x_32gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: - """ - Constructs a RegNetX_32GF architecture from - `"Designing Network Design Spaces" `_. - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - progress (bool): If True, displays a progress bar of the download to stderr - """ - params = RegNetParams(depth=23, w_0=320, w_a=69.86, w_m=2.0, group_width=168, - use_se=False, **kwargs) - return _regnet("regnet_x_32gf", params, pretrained, progress, **kwargs) - -# TODO(kazhang): Add RegNetZ_500MF and RegNetZ_4GF + pass From 12b9d72885e068a3a8dca7f70b4c293777f608a2 Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Thu, 9 Sep 2021 05:08:36 +0000 Subject: [PATCH 17/40] add SqueezeExcitation --- torchvision/models/regnet.py | 40 ++++++++++++++++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py index 96493ae3c4c..eff8273695a 100644 --- a/torchvision/models/regnet.py +++ b/torchvision/models/regnet.py @@ -1,10 +1,46 @@ -from torch import nn +from torch import nn, Tenspr class RegNetParams: pass class SqueezeExcitation(nn.Module): - pass + """ + Squeeze and excitation layer from + `"Squeeze-and-Excitation Networks" `_. 
+ """ + + def __init__( + self, + in_channels: int, + reduction_ratio: Optional[int] = 16, + reduced_channels: Optional[int] = None, + activation: Optional[nn.Module] = None, + ) -> None: + super().__init__() + self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) + + # Either reduction_ratio is defined, or out_channels is defined, + # neither both nor none of them + assert bool(reduction_ratio) != bool(reduced_channels) + + if activation is None: + activation = nn.ReLU() + + reduced_channels = ( + in_channels // reduction_ratio if reduced_channels is None else reduced_channels + ) + self.excitation = nn.Sequential( + nn.Conv2d(in_channels, reduced_channels, kernel_size=1, stride=1, bias=True), + activation, + nn.Conv2d(reduced_channels, in_channels, kernel_size=1, stride=1, bias=True), + nn.Sigmoid(), + ) + + def forward(self, x: Tensor) -> Tensor: + x_squeezed = self.avgpool(x) + x_excited = self.excitation(x_squeezed) + x_scaled = x * x_excited + return x_scaled class RegNet(nn.Module): pass From 89fbb2b95ed6cec1e6bb365afd123745a952ae9d Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Thu, 9 Sep 2021 00:15:55 +0000 Subject: [PATCH 18/40] initial code --- torchvision/models/regnet.py | 40 ++---------------------------------- 1 file changed, 2 insertions(+), 38 deletions(-) diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py index eff8273695a..96493ae3c4c 100644 --- a/torchvision/models/regnet.py +++ b/torchvision/models/regnet.py @@ -1,46 +1,10 @@ -from torch import nn, Tenspr +from torch import nn class RegNetParams: pass class SqueezeExcitation(nn.Module): - """ - Squeeze and excitation layer from - `"Squeeze-and-Excitation Networks" `_. - """ - - def __init__( - self, - in_channels: int, - reduction_ratio: Optional[int] = 16, - reduced_channels: Optional[int] = None, - activation: Optional[nn.Module] = None, - ) -> None: - super().__init__() - self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) - - # Either reduction_ratio is defined, or out_channels is defined, - # neither both nor none of them - assert bool(reduction_ratio) != bool(reduced_channels) - - if activation is None: - activation = nn.ReLU() - - reduced_channels = ( - in_channels // reduction_ratio if reduced_channels is None else reduced_channels - ) - self.excitation = nn.Sequential( - nn.Conv2d(in_channels, reduced_channels, kernel_size=1, stride=1, bias=True), - activation, - nn.Conv2d(reduced_channels, in_channels, kernel_size=1, stride=1, bias=True), - nn.Sigmoid(), - ) - - def forward(self, x: Tensor) -> Tensor: - x_squeezed = self.avgpool(x) - x_excited = self.excitation(x_squeezed) - x_scaled = x * x_excited - return x_scaled + pass class RegNet(nn.Module): pass From df4890387c7deca666721ccdcf881050ffa4871e Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Thu, 9 Sep 2021 05:08:36 +0000 Subject: [PATCH 19/40] add SqueezeExcitation --- torchvision/models/regnet.py | 40 ++++++++++++++++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py index 96493ae3c4c..eff8273695a 100644 --- a/torchvision/models/regnet.py +++ b/torchvision/models/regnet.py @@ -1,10 +1,46 @@ -from torch import nn +from torch import nn, Tenspr class RegNetParams: pass class SqueezeExcitation(nn.Module): - pass + """ + Squeeze and excitation layer from + `"Squeeze-and-Excitation Networks" `_. 
+ """ + + def __init__( + self, + in_channels: int, + reduction_ratio: Optional[int] = 16, + reduced_channels: Optional[int] = None, + activation: Optional[nn.Module] = None, + ) -> None: + super().__init__() + self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) + + # Either reduction_ratio is defined, or out_channels is defined, + # neither both nor none of them + assert bool(reduction_ratio) != bool(reduced_channels) + + if activation is None: + activation = nn.ReLU() + + reduced_channels = ( + in_channels // reduction_ratio if reduced_channels is None else reduced_channels + ) + self.excitation = nn.Sequential( + nn.Conv2d(in_channels, reduced_channels, kernel_size=1, stride=1, bias=True), + activation, + nn.Conv2d(reduced_channels, in_channels, kernel_size=1, stride=1, bias=True), + nn.Sigmoid(), + ) + + def forward(self, x: Tensor) -> Tensor: + x_squeezed = self.avgpool(x) + x_excited = self.excitation(x_squeezed) + x_scaled = x * x_excited + return x_scaled class RegNet(nn.Module): pass From d71014c51f9b40e062a987cffc80d15e80ec39b4 Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Fri, 10 Sep 2021 23:21:17 +0000 Subject: [PATCH 20/40] add SqueezeExcitation --- torchvision/models/regnet.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py index eff8273695a..aa41e3096c2 100644 --- a/torchvision/models/regnet.py +++ b/torchvision/models/regnet.py @@ -1,11 +1,18 @@ from torch import nn, Tenspr +from torchvision.models.mobilenetv2 import _make_divisible + + +model_urls = { +} + class RegNetParams: pass + class SqueezeExcitation(nn.Module): """ - Squeeze and excitation layer from + Squeeze and excitation layer from `"Squeeze-and-Excitation Networks" `_. """ @@ -42,8 +49,20 @@ def forward(self, x: Tensor) -> Tensor: x_scaled = x * x_excited return x_scaled + class RegNet(nn.Module): pass + +def _regnet(arch: str, pretrained: bool, progress: bool, **kwargs: Any) -> RegNet: + model = RegNet() + if pretrained: + if arch not in model_urls: + raise ValueError(f"No checkpoint is available for model type {arch}") + state_dict = load_state_dict_from_url(model_urls[arch], progress=progress) + model.load_state_dict(state_dict) + return model + + def regnet_y_400mf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: - pass + return _regnet("regnet_y_400mf", pretrained, progress, **kwargs) From b440ae4828642a72b879590229198fcf3a3de0cf Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Tue, 14 Sep 2021 05:57:30 +0000 Subject: [PATCH 21/40] regnet blocks, stems and model definition --- torchvision/models/regnet.py | 563 ++++++++++++++++++++++++++++++++++- 1 file changed, 556 insertions(+), 7 deletions(-) diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py index aa41e3096c2..eb15da6e0ad 100644 --- a/torchvision/models/regnet.py +++ b/torchvision/models/regnet.py @@ -1,4 +1,11 @@ -from torch import nn, Tenspr +import numpy as np +import math +import torch + +from collections import OrderedDict +from enum import Enum, auto +from typing import Any, List, Optional +from torch import nn, Tensor from torchvision.models.mobilenetv2 import _make_divisible @@ -6,11 +13,140 @@ } +# The different possible blocks +class BlockType(Enum): + VANILLA_BLOCK = auto() + RES_BASIC_BLOCK = auto() + RES_BOTTLENECK_BLOCK = auto() + RES_BOTTLENECK_LINEAR_BLOCK = auto() + + +# The different possible Stems +class StemType(Enum): + RES_STEM_CIFAR = auto() + RES_STEM_IN = auto() + SIMPLE_STEM_IN = auto() 
+ + +# The different possible activations +class ActivationType(Enum): + RELU = auto() + SILU = auto() + + class RegNetParams: - pass + def __init__( + self, + depth: int, + w_0: int, + w_a: float, + w_m: float, + group_width: int, + bottleneck_multiplier: float = 1.0, + stem_type: StemType = StemType.SIMPLE_STEM_IN, + stem_width: int = 32, + block_type: BlockType = BlockType.RES_BOTTLENECK_BLOCK, + activation: ActivationType = ActivationType.RELU, + use_se: bool = True, + se_ratio: float = 0.25, + bn_epsilon: float = 1e-05, + bn_momentum: bool = 0.1, + ) -> None: + if w_a < 0 or w_0 <= 0 or w_m <= 1 or w_0 % 8 != 0: + raise ValueError("Invalid RegNet settings") + self.depth = depth + self.w_0 = w_0 + self.w_a = w_a + self.w_m = w_m + self.group_width = group_width + self.bottleneck_multiplier = bottleneck_multiplier + self.stem_type = stem_type + self.block_type = block_type + self.activation = activation + self.stem_width = stem_width + self.use_se = use_se + self.se_ratio = se_ratio if use_se else None + self.bn_epsilon = bn_epsilon + self.bn_momentum = bn_momentum + + def get_expanded_params(self): + """ + Programatically compute all the per-block settings, + given the RegNet parameters. + + The first step is to compute the quantized linear block parameters, + in log space. Key parameters are: + - `w_a` is the width progression slope + - `w_0` is the initial width + - `w_m` is the width stepping in the log space + + In other terms + `log(block_width) = log(w_0) + w_m * block_capacity`, + with `bock_capacity` ramping up following the w_0 and w_a params. + This block width is finally quantized to multiples of 8. + + The second step is to compute the parameters per stage, + taking into account the skip connection and the final 1x1 convolutions. + We use the fact that the output width is constant within a stage. + """ + + QUANT = 8 + STRIDE = 2 + + # Compute the block widths. Each stage has one unique block width + widths_cont = np.arange(self.depth) * self.w_a + self.w_0 + block_capacity = np.round(np.log(widths_cont / self.w_0) / np.log(self.w_m)) + block_widths = ( + np.round(np.divide(self.w_0 * np.power(self.w_m, block_capacity), QUANT)) + * QUANT + ) + num_stages = len(np.unique(block_widths)) + block_widths = block_widths.astype(int).tolist() + + # Convert to per stage parameters + split_helper = zip( + block_widths + [0], + [0] + block_widths, + block_widths + [0], + [0] + block_widths, + ) + splits = [w != wp or r != rp for w, wp, r, rp in split_helper] + + stage_widths = [w for w, t in zip(block_widths, splits[:-1]) if t] + stage_depths = np.diff([d for d, t in enumerate(splits) if t]).tolist() + strides = [STRIDE] * num_stages + bottleneck_multipliers = [self.bottleneck_multiplier] * num_stages + group_widths = [self.group_width] * num_stages -class SqueezeExcitation(nn.Module): + # Adjust the compatibility of stage widths and group widths + stage_widths, group_widths = self._adjust_widths_groups_compatibilty( + stage_widths, bottleneck_multipliers, group_widths + ) + + return zip( + stage_widths, strides, stage_depths, group_widths, bottleneck_multipliers + ) + + @staticmethod + def _adjust_widths_groups_compatibilty( + stage_widths: List[int], bottleneck_ratios: List[float], + group_widths: List[int]) -> Tuple(List[int], List[int]): + """ + Adjusts the compatibility of widths and groups, + depending on the bottleneck ratio. 
+ """ + # Compute all widths for the current settings + widths = [int(w * b) for w, b in zip(stage_widths, bottleneck_ratios)] + group_widths_min = [min(g, w_bot) for g, w_bot in zip(group_widths, widths)] + + # Compute the adjusted widths so that stage and group widths fit + ws_bot = [_make_divisible(w_bot, g) for w_bot, g in zip(widths, group_widths_min)] + stage_widths = [int(w_bot / b) for w_bot, b in zip(ws_bot, bottleneck_ratios)] + return stage_widths, group_widths_min + + +class _SqueezeExcitation(nn.Module): """ Squeeze and excitation layer from `"Squeeze-and-Excitation Networks" `_. @@ -50,12 +186,424 @@ def forward(self, x: Tensor) -> Tensor: return x_scaled +class BasicTransform(nn.Sequential): + """Basic transformation: [3x3 conv, BN, Relu] x2.""" + + def __init__( + self, + width_in: int, + width_out: int, + stride: int, + bn_epsilon: float, + bn_momentum: float, + activation: nn.Module, + ) -> None: + super().__init__() + + self.a = nn.Sequential( + nn.Conv2d(width_in, width_out, 3, stride=stride, padding=1, bias=False), + nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum), + activation, + nn.Conv2d(width_out, width_out, 3, stride=1, padding=1, bias=False), + ) + + self.final_bn = nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum) + self.depth = 2 + + +class ResStemCifar(nn.Sequential): + """ResNet stem for CIFAR: 3x3, BN, ReLU.""" + + def __init__( + self, + width_in: int, + width_out: int, + bn_epsilon: float, + bn_momentum: float, + activation: nn.Module, + ) -> None: + super().__init__() + self.stem = nn.Sequential( + nn.Conv2d(width_in, width_out, 3, stride=1, padding=1, bias=False), + nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum), + activation, + ) + self.depth = 2 + + +class ResStemIN(nn.Sequential): + """ResNet stem for ImageNet: 7x7, BN, ReLU, MaxPool.""" + + def __init__( + self, + width_in: int, + width_out: int, + bn_epsilon: float, + bn_momentum: float, + activation: nn.Module, + ) -> None: + super().__init__() + self.stem = nn.Sequential( + nn.Conv2d(width_in, width_out, 7, stride=2, padding=3, bias=False), + nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum), + activation, + nn.MaxPool2d(3, stride=2, padding=1), + ) + self.depth = 3 + + +class SimpleStemIN(nn.Sequential): + """Simple stem for ImageNet: 3x3, BN, ReLU.""" + + def __init__( + self, + width_in: int, + width_out: int, + bn_epsilon: float, + bn_momentum: float, + activation: nn.Module, + ) -> None: + super().__init__() + self.stem = nn.Sequential( + nn.Conv2d(width_in, width_out, 3, stride=2, padding=1, bias=False), + nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum), + activation, + ) + self.depth = 2 + + +class VanillaBlock(nn.Sequential): + """Vanilla block: [3x3 conv, BN, Relu] x2.""" + + def __init__( + self, + width_in: int, + width_out: int, + stride: int, + bn_epsilon: float, + bn_momentum: float, + activation: nn.Module, + *args, + **kwargs, + ) -> None: + super().__init__() + self.a = nn.Sequential( + nn.Conv2d(width_in, width_out, 3, stride=stride, padding=1, bias=False), + nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum), + activation, + ) + + self.b = nn.Sequential( + nn.Conv2d(width_out, width_out, 3, stride=1, padding=1, bias=False), + nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum), + activation, + ) + + self.depth = 2 + + +class ResBasicBlock(nn.Module): + """Residual basic block: x + F(x), F = basic transform.""" + + def __init__( + self, + width_in: int, + width_out: int, + 
stride: int, + bn_epsilon: float, + bn_momentum: float, + activation: nn.Module, + *args, + **kwargs, + ) -> None: + super().__init__() + self.proj_block = (width_in != width_out) or (stride != 1) + if self.proj_block: + self.proj = nn.Conv2d( + width_in, width_out, 1, stride=stride, padding=0, bias=False + ) + self.bn = nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum) + self.f = BasicTransform( + width_in, width_out, stride, bn_epsilon, bn_momentum, activation + ) + self.activation = activation + + # The projection and transform happen in parallel, + # and ReLU is not counted with respect to depth + self.depth = self.f.depth + + def forward(self, x: Tensor) -> Tensor: + if self.proj_block: + x = self.bn(self.proj(x)) + self.f(x) + else: + x = x + self.f(x) + + return self.activation(x) + + +class BottleneckTransform(nn.Sequential): + """Bottleneck transformation: 1x1, 3x3 [+SE], 1x1.""" + + def __init__( + self, + width_in: int, + width_out: int, + stride: int, + bn_epsilon: float, + bn_momentum: float, + activation: nn.Module, + group_width: int, + bottleneck_multiplier: float, + se_ratio: Optional[float], + ) -> None: + super().__init__() + w_b = int(round(width_out * bottleneck_multiplier)) + g = w_b // group_width + + self.a = nn.Sequential( + nn.Conv2d(width_in, w_b, 1, stride=1, padding=0, bias=False), + nn.BatchNorm2d(w_b, eps=bn_epsilon, momentum=bn_momentum), + activation, + ) + + self.b = nn.Sequential( + nn.Conv2d(w_b, w_b, 3, stride=stride, padding=1, groups=g, bias=False), + nn.BatchNorm2d(w_b, eps=bn_epsilon, momentum=bn_momentum), + activation, + ) + + if se_ratio: + # The SE reduction ratio is defined with respect to the + # beginning of the block + width_se_out = int(round(se_ratio * width_in)) + self.se = _SqueezeExcitation( + in_channels=w_b, + reduction_ratio=None, + reduced_channels=width_se_out, + activation=activation, + ) + + self.c = nn.Conv2d(w_b, width_out, 1, stride=1, padding=0, bias=False) + self.final_bn = nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum) + self.depth = 3 if not se_ratio else 4 + + +class ResBottleneckBlock(nn.Module): + """Residual bottleneck block: x + F(x), F = bottleneck transform.""" + + def __init__( + self, + width_in: int, + width_out: int, + stride: int, + bn_epsilon: float, + bn_momentum: float, + activation: nn.Module, + group_width: int = 1, + bottleneck_multiplier: float = 1.0, + se_ratio: Optional[float] = None, + ) -> None: + super().__init__() + + # Use skip connection with projection if shape changes + self.proj_block = (width_in != width_out) or (stride != 1) + if self.proj_block: + self.proj = nn.Conv2d( + width_in, width_out, 1, stride=stride, padding=0, bias=False + ) + self.bn = nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum) + self.f = BottleneckTransform( + width_in, + width_out, + stride, + bn_epsilon, + bn_momentum, + activation, + group_width, + bottleneck_multiplier, + se_ratio, + ) + self.activation = activation + + # The projection and transform happen in parallel, + # and activation is not counted with respect to depth + self.depth = self.f.depth + + def forward(self, x: Tensor) -> Tensor: + if self.proj_block: + x = self.bn(self.proj(x)) + self.f(x) + else: + x = x + self.f(x) + return self.activation(x) + + +class ResBottleneckLinearBlock(nn.Module): + """Residual linear bottleneck block: x + F(x), F = bottleneck transform.""" + + def __init__( + self, + width_in: int, + width_out: int, + stride: int, + bn_epsilon: float, + bn_momentum: float, + activation: 
nn.Module, + group_width: int = 1, + bottleneck_multiplier: float = 4.0, + se_ratio: Optional[float] = None, + ) -> None: + super().__init__() + self.has_skip = (width_in == width_out) and (stride == 1) + self.f = BottleneckTransform( + width_in, + width_out, + stride, + bn_epsilon, + bn_momentum, + activation, + group_width, + bottleneck_multiplier, + se_ratio, + ) + + self.depth = self.f.depth + + def forward(self, x: Tensor) -> Tensor: + return x + self.f(x) if self.has_skip else self.f(x) + + +class AnyStage(nn.Sequential): + """AnyNet stage (sequence of blocks w/ the same output shape).""" + + def __init__( + self, + width_in: int, + width_out: int, + stride: int, + depth: int, + block_constructor: nn.Module, + activation: nn.Module, + group_width: int, + bottleneck_multiplier: float, + params: "AnyNetParams", + stage_index: int = 0, + ) -> None: + super().__init__() + self.stage_depth = 0 + + for i in range(depth): + block = block_constructor( + width_in if i == 0 else width_out, + width_out, + stride if i == 0 else 1, + params.bn_epsilon, + params.bn_momentum, + activation, + group_width, + bottleneck_multiplier, + params.se_ratio, + ) + + self.stage_depth += block.depth + self.add_module(f"block{stage_index}-{i}", block) + + class RegNet(nn.Module): - pass + def __init__(self, params: RegNetParams) -> None: + super().__init__() + + if params.activation == ActivationType.SILU and torch.__version__ < "1.7": + raise ValueError("SiLU activation is only supported since PyTorch 1.7") + + silu = None if torch.__version__ < "1.7" else nn.SiLU() + activation = { + ActivationType.RELU: nn.ReLU(inplace=True), + ActivationType.SILU: silu, + }[params.activation] + + # Ad hoc stem + self.stem = { + StemType.RES_STEM_CIFAR: ResStemCifar, + StemType.RES_STEM_IN: ResStemIN, + StemType.SIMPLE_STEM_IN: SimpleStemIN, + }[params.stem_type]( + 3, # width_in + params.stem_width, + params.bn_epsilon, + params.bn_momentum, + activation, + ) + + # Instantiate all the AnyNet blocks in the trunk + block_fun = { + BlockType.VANILLA_BLOCK: VanillaBlock, + BlockType.RES_BASIC_BLOCK: ResBasicBlock, + BlockType.RES_BOTTLENECK_BLOCK: ResBottleneckBlock, + BlockType.RES_BOTTLENECK_LINEAR_BLOCK: ResBottleneckLinearBlock, + }[params.block_type] + + current_width = params.stem_width + + self.trunk_depth = 0 + + blocks = [] + for i, ( + width_out, + stride, + depth, + group_width, + bottleneck_multiplier, + ) in enumerate(params.get_expanded_params()): + blocks.append( + ( + f"block{i+1}", + AnyStage( + current_width, + width_out, + stride, + depth, + block_fun, + activation, + group_width, + bottleneck_multiplier, + params, + stage_index=i + 1, + ), + ) + ) + + self.trunk_depth += blocks[-1][1].stage_depth + + current_width = width_out + + self.trunk_output = nn.Sequential(OrderedDict(blocks)) + + # Init weights and good to go + self._init_weights() + + def forward(self, x: Tensor) -> Tensor: + x = self.stem(x) + x = self.trunk_output(x) + + return x + + def _init_weights(self) -> None: + # Performs ResNet-style weight initialization + for m in self.modules(): + if isinstance(m, nn.Conv2d): + # Note that there is no bias due to BN + fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + m.weight.data.normal_(mean=0.0, std=math.sqrt(2.0 / fan_out)) + elif isinstance(m, nn.BatchNorm2d): + m.weight.data.fill_(1.0) + m.bias.data.zero_() + elif isinstance(m, nn.Linear): + m.weight.data.normal_(mean=0.0, std=0.01) + m.bias.data.zero_() -def _regnet(arch: str, pretrained: bool, progress: bool, **kwargs: Any) -> 
RegNet: - model = RegNet() +def _regnet(arch: str, params: RegNetParams, pretrained: bool, progress: bool, **kwargs: Any) -> RegNet: + model = RegNet(params) if pretrained: if arch not in model_urls: raise ValueError(f"No checkpoint is available for model type {arch}") @@ -65,4 +613,5 @@ def _regnet(arch: str, pretrained: bool, progress: bool, **kwargs: Any) -> RegNe def regnet_y_400mf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: - return _regnet("regnet_y_400mf", pretrained, progress, **kwargs) + params = RegNetParams(depth=16, w_0=48, w_a=27.89, w_m=2.09, group_width=8, **kwargs) + return _regnet("regnet_y_400mf", params, pretrained, progress, **kwargs) From 0dc5bc8549751af75456a503c327c8c1937e8d09 Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Tue, 14 Sep 2021 06:21:00 +0000 Subject: [PATCH 22/40] nit --- torchvision/models/regnet.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py index eb15da6e0ad..80fed6d6a6e 100644 --- a/torchvision/models/regnet.py +++ b/torchvision/models/regnet.py @@ -4,7 +4,7 @@ from collections import OrderedDict from enum import Enum, auto -from typing import Any, List, Optional +from typing import Any, List, Optional, Tuple from torch import nn, Tensor from torchvision.models.mobilenetv2 import _make_divisible @@ -131,7 +131,7 @@ def get_expanded_params(self): @staticmethod def _adjust_widths_groups_compatibilty( stage_widths: List[int], bottleneck_ratios: List[float], - group_widths: List[int]) -> Tuple(List[int], List[int]): + group_widths: List[int]) -> Tuple[List[int], List[int]]: """ Adjusts the compatibility of widths and groups, depending on the bottleneck ratio. From e02d886c1224dbd953d962a5e87c62261888284f Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Tue, 14 Sep 2021 23:06:53 +0000 Subject: [PATCH 23/40] add fc layer --- torchvision/models/regnet.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py index 80fed6d6a6e..5d813d59b9e 100644 --- a/torchvision/models/regnet.py +++ b/torchvision/models/regnet.py @@ -51,6 +51,7 @@ def __init__( se_ratio: float = 0.25, bn_epsilon: float = 1e-05, bn_momentum: bool = 0.1, + num_classes: int = 1000, ) -> None: if w_a < 0 or w_0 <= 0 or w_m <= 1 or w_0 % 8 != 0: raise ValueError("Invalid RegNet settings") @@ -68,6 +69,7 @@ def __init__( self.se_ratio = se_ratio if use_se else None self.bn_epsilon = bn_epsilon self.bn_momentum = bn_momentum + self.num_classes = num_classes def get_expanded_params(self): """ @@ -578,13 +580,20 @@ def __init__(self, params: RegNetParams) -> None: self.trunk_output = nn.Sequential(OrderedDict(blocks)) + self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) + self.fc = nn.Linear(in_features=current_width, out_features=params.num_classes) + # Init weights and good to go self._init_weights() def forward(self, x: Tensor) -> Tensor: x = self.stem(x) x = self.trunk_output(x) - + + x = self.avgpool(x) + x = x.flatten(start_dim=1) + x = self.fc(x) + return x def _init_weights(self) -> None: From 5a6c7294032b5fca9d326c5b01df5637b1c4a0aa Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Wed, 15 Sep 2021 07:19:45 +0000 Subject: [PATCH 24/40] use Callable instead of Enum for block, stem and activation --- torchvision/models/regnet.py | 296 ++++++++++++++++------------------- 1 file changed, 132 insertions(+), 164 deletions(-) diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py index 
5d813d59b9e..76d3381fe5b 100644 --- a/torchvision/models/regnet.py +++ b/torchvision/models/regnet.py @@ -4,150 +4,19 @@ from collections import OrderedDict from enum import Enum, auto -from typing import Any, List, Optional, Tuple +from typing import Any, Callable, List, Optional, Tuple from torch import nn, Tensor + +from .._internally_replaced_utils import load_state_dict_from_url from torchvision.models.mobilenetv2 import _make_divisible model_urls = { + # TODO(kazhang): add pretrained weights + "regnet_y_400m": "", } -# The different possible blocks -class BlockType(Enum): - VANILLA_BLOCK = auto() - RES_BASIC_BLOCK = auto() - RES_BOTTLENECK_BLOCK = auto() - RES_BOTTLENECK_LINEAR_BLOCK = auto() - - -# The different possible Stems -class StemType(Enum): - RES_STEM_CIFAR = auto() - RES_STEM_IN = auto() - SIMPLE_STEM_IN = auto() - - -# The different possible activations -class ActivationType(Enum): - RELU = auto() - SILU = auto() - - -class RegNetParams: - def __init__( - self, - depth: int, - w_0: int, - w_a: float, - w_m: float, - group_width: int, - bottleneck_multiplier: float = 1.0, - stem_type: StemType = StemType.SIMPLE_STEM_IN, - stem_width: int = 32, - block_type: BlockType = BlockType.RES_BOTTLENECK_BLOCK, - activation: ActivationType = ActivationType.RELU, - use_se: bool = True, - se_ratio: float = 0.25, - bn_epsilon: float = 1e-05, - bn_momentum: bool = 0.1, - num_classes: int = 1000, - ) -> None: - if w_a < 0 or w_0 <= 0 or w_m <= 1 or w_0 % 8 != 0: - raise ValueError("Invalid RegNet settings") - self.depth = depth - self.w_0 = w_0 - self.w_a = w_a - self.w_m = w_m - self.group_width = group_width - self.bottleneck_multiplier = bottleneck_multiplier - self.stem_type = stem_type - self.block_type = block_type - self.activation = activation - self.stem_width = stem_width - self.use_se = use_se - self.se_ratio = se_ratio if use_se else None - self.bn_epsilon = bn_epsilon - self.bn_momentum = bn_momentum - self.num_classes = num_classes - - def get_expanded_params(self): - """ - Programatically compute all the per-block settings, - given the RegNet parameters. - - The first step is to compute the quantized linear block parameters, - in log space. Key parameters are: - - `w_a` is the width progression slope - - `w_0` is the initial width - - `w_m` is the width stepping in the log space - - In other terms - `log(block_width) = log(w_0) + w_m * block_capacity`, - with `bock_capacity` ramping up following the w_0 and w_a params. - This block width is finally quantized to multiples of 8. - - The second step is to compute the parameters per stage, - taking into account the skip connection and the final 1x1 convolutions. - We use the fact that the output width is constant within a stage. - """ - - QUANT = 8 - STRIDE = 2 - - # Compute the block widths. 
Each stage has one unique block width - widths_cont = np.arange(self.depth) * self.w_a + self.w_0 - block_capacity = np.round(np.log(widths_cont / self.w_0) / np.log(self.w_m)) - block_widths = ( - np.round(np.divide(self.w_0 * np.power(self.w_m, block_capacity), QUANT)) - * QUANT - ) - num_stages = len(np.unique(block_widths)) - block_widths = block_widths.astype(int).tolist() - - # Convert to per stage parameters - split_helper = zip( - block_widths + [0], - [0] + block_widths, - block_widths + [0], - [0] + block_widths, - ) - splits = [w != wp or r != rp for w, wp, r, rp in split_helper] - - stage_widths = [w for w, t in zip(block_widths, splits[:-1]) if t] - stage_depths = np.diff([d for d, t in enumerate(splits) if t]).tolist() - - strides = [STRIDE] * num_stages - bottleneck_multipliers = [self.bottleneck_multiplier] * num_stages - group_widths = [self.group_width] * num_stages - - # Adjust the compatibility of stage widths and group widths - stage_widths, group_widths = self._adjust_widths_groups_compatibilty( - stage_widths, bottleneck_multipliers, group_widths - ) - - return zip( - stage_widths, strides, stage_depths, group_widths, bottleneck_multipliers - ) - - @staticmethod - def _adjust_widths_groups_compatibilty( - stage_widths: List[int], bottleneck_ratios: List[float], - group_widths: List[int]) -> Tuple[List[int], List[int]]: - """ - Adjusts the compatibility of widths and groups, - depending on the bottleneck ratio. - """ - # Compute all widths for the current settings - widths = [int(w * b) for w, b in zip(stage_widths, bottleneck_ratios)] - group_widths_min = [min(g, w_bot) for g, w_bot in zip(group_widths, widths)] - - # Compute the adjusted widths so that stage and group widths fit - ws_bot = [_make_divisible(w_bot, g) for w_bot, g in zip(widths, group_widths_min)] - stage_widths = [int(w_bot / b) for w_bot, b in zip(ws_bot, bottleneck_ratios)] - return stage_widths, group_widths_min - - class _SqueezeExcitation(nn.Module): """ Squeeze and excitation layer from @@ -483,11 +352,13 @@ def __init__( width_out: int, stride: int, depth: int, - block_constructor: nn.Module, + block_constructor: Callable[..., nn.Module], + bn_epsilon: float, + bn_momentum: float, activation: nn.Module, group_width: int, bottleneck_multiplier: float, - params: "AnyNetParams", + se_ratio: Optional[float] = None, stage_index: int = 0, ) -> None: super().__init__() @@ -498,37 +369,140 @@ def __init__( width_in if i == 0 else width_out, width_out, stride if i == 0 else 1, - params.bn_epsilon, - params.bn_momentum, + bn_epsilon, + bn_momentum, activation, group_width, bottleneck_multiplier, - params.se_ratio, + se_ratio, ) self.stage_depth += block.depth self.add_module(f"block{stage_index}-{i}", block) +class RegNetParams: + def __init__( + self, + depth: int, + w_0: int, + w_a: float, + w_m: float, + group_width: int, + bottleneck_multiplier: float = 1.0, + stem_type: Callable[..., nn.Module] = SimpleStemIN, + stem_width: int = 32, + block_type: Callable[..., nn.Module] = ResBottleneckBlock, + activation: Callable[..., nn.Module] = nn.ReLU, + use_se: bool = True, + se_ratio: float = 0.25, + bn_epsilon: float = 1e-05, + bn_momentum: float = 0.1, + num_classes: int = 1000, + ) -> None: + if w_a < 0 or w_0 <= 0 or w_m <= 1 or w_0 % 8 != 0: + raise ValueError("Invalid RegNet settings") + self.depth = depth + self.w_0 = w_0 + self.w_a = w_a + self.w_m = w_m + self.group_width = group_width + self.bottleneck_multiplier = bottleneck_multiplier + self.stem_type = stem_type + self.block_type = block_type 
+ self.activation = activation + self.stem_width = stem_width + self.use_se = use_se + self.se_ratio = se_ratio if use_se else None + self.bn_epsilon = bn_epsilon + self.bn_momentum = bn_momentum + self.num_classes = num_classes + + def get_expanded_params(self): + """ + Programatically compute all the per-block settings, + given the RegNet parameters. + + The first step is to compute the quantized linear block parameters, + in log space. Key parameters are: + - `w_a` is the width progression slope + - `w_0` is the initial width + - `w_m` is the width stepping in the log space + + In other terms + `log(block_width) = log(w_0) + w_m * block_capacity`, + with `bock_capacity` ramping up following the w_0 and w_a params. + This block width is finally quantized to multiples of 8. + + The second step is to compute the parameters per stage, + taking into account the skip connection and the final 1x1 convolutions. + We use the fact that the output width is constant within a stage. + """ + + QUANT = 8 + STRIDE = 2 + + # Compute the block widths. Each stage has one unique block width + widths_cont = np.arange(self.depth) * self.w_a + self.w_0 + block_capacity = np.round(np.log(widths_cont / self.w_0) / np.log(self.w_m)) + block_widths = ( + np.round(np.divide(self.w_0 * np.power(self.w_m, block_capacity), QUANT)) + * QUANT + ) + num_stages = len(np.unique(block_widths)) + block_widths = block_widths.astype(int).tolist() + + # Convert to per stage parameters + split_helper = zip( + block_widths + [0], + [0] + block_widths, + block_widths + [0], + [0] + block_widths, + ) + splits = [w != wp or r != rp for w, wp, r, rp in split_helper] + + stage_widths = [w for w, t in zip(block_widths, splits[:-1]) if t] + stage_depths = np.diff([d for d, t in enumerate(splits) if t]).tolist() + + strides = [STRIDE] * num_stages + bottleneck_multipliers = [self.bottleneck_multiplier] * num_stages + group_widths = [self.group_width] * num_stages + + # Adjust the compatibility of stage widths and group widths + stage_widths, group_widths = self._adjust_widths_groups_compatibilty( + stage_widths, bottleneck_multipliers, group_widths + ) + + return zip( + stage_widths, strides, stage_depths, group_widths, bottleneck_multipliers + ) + + @staticmethod + def _adjust_widths_groups_compatibilty( + stage_widths: List[int], bottleneck_ratios: List[float], + group_widths: List[int]) -> Tuple[List[int], List[int]]: + """ + Adjusts the compatibility of widths and groups, + depending on the bottleneck ratio. 
+ """ + # Compute all widths for the current settings + widths = [int(w * b) for w, b in zip(stage_widths, bottleneck_ratios)] + group_widths_min = [min(g, w_bot) for g, w_bot in zip(group_widths, widths)] + + # Compute the adjusted widths so that stage and group widths fit + ws_bot = [_make_divisible(w_bot, g) for w_bot, g in zip(widths, group_widths_min)] + stage_widths = [int(w_bot / b) for w_bot, b in zip(ws_bot, bottleneck_ratios)] + return stage_widths, group_widths_min + + class RegNet(nn.Module): def __init__(self, params: RegNetParams) -> None: super().__init__() - if params.activation == ActivationType.SILU and torch.__version__ < "1.7": - raise ValueError("SiLU activation is only supported since PyTorch 1.7") - - silu = None if torch.__version__ < "1.7" else nn.SiLU() - activation = { - ActivationType.RELU: nn.ReLU(inplace=True), - ActivationType.SILU: silu, - }[params.activation] + activation = params.activation(inplace=True) # Ad hoc stem - self.stem = { - StemType.RES_STEM_CIFAR: ResStemCifar, - StemType.RES_STEM_IN: ResStemIN, - StemType.SIMPLE_STEM_IN: SimpleStemIN, - }[params.stem_type]( + self.stem = params.stem_type( 3, # width_in params.stem_width, params.bn_epsilon, @@ -536,14 +510,6 @@ def __init__(self, params: RegNetParams) -> None: activation, ) - # Instantiate all the AnyNet blocks in the trunk - block_fun = { - BlockType.VANILLA_BLOCK: VanillaBlock, - BlockType.RES_BASIC_BLOCK: ResBasicBlock, - BlockType.RES_BOTTLENECK_BLOCK: ResBottleneckBlock, - BlockType.RES_BOTTLENECK_LINEAR_BLOCK: ResBottleneckLinearBlock, - }[params.block_type] - current_width = params.stem_width self.trunk_depth = 0 @@ -564,11 +530,13 @@ def __init__(self, params: RegNetParams) -> None: width_out, stride, depth, - block_fun, + params.block_type, + params.bn_epsilon, + params.bn_momentum, activation, group_width, bottleneck_multiplier, - params, + params.se_ratio, stage_index=i + 1, ), ) From 48a6e36479f8f8bbdd285386d980cc11efe3e727 Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Wed, 15 Sep 2021 21:30:52 +0000 Subject: [PATCH 25/40] add regnet_x and regnet_y model build functions, add docs --- torchvision/models/regnet.py | 193 ++++++++++++++++++++++++++++++++++- 1 file changed, 192 insertions(+), 1 deletion(-) diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py index 76d3381fe5b..98c60296015 100644 --- a/torchvision/models/regnet.py +++ b/torchvision/models/regnet.py @@ -1,3 +1,8 @@ +# Modified from +# https://github.com/facebookresearch/ClassyVision/blob/main/classy_vision/models/anynet.py +# https://github.com/facebookresearch/ClassyVision/blob/main/classy_vision/models/regnet.py + + import numpy as np import math import torch @@ -10,7 +15,6 @@ from .._internally_replaced_utils import load_state_dict_from_url from torchvision.models.mobilenetv2 import _make_divisible - model_urls = { # TODO(kazhang): add pretrained weights "regnet_y_400m": "", @@ -590,5 +594,192 @@ def _regnet(arch: str, params: RegNetParams, pretrained: bool, progress: bool, * def regnet_y_400mf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetY_400MF architecture from + `"Designing Network Design Spaces" `_. 
+ + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ params = RegNetParams(depth=16, w_0=48, w_a=27.89, w_m=2.09, group_width=8, **kwargs) return _regnet("regnet_y_400mf", params, pretrained, progress, **kwargs) + + +def regnet_y_800mf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetY_800MF architecture from + `"Designing Network Design Spaces" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = RegNetParams(depth=14, w_0=56, w_a=38.84, w_m=2.4, group_width=16, **kwargs) + return _regnet("regnet_y_800mf", params, pretrained, progress, **kwargs) + + +def regnet_y_1_6gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetY_1.6GF architecture from + `"Designing Network Design Spaces" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = RegNetParams(depth=27, w_0=48, w_a=20.71, w_m=2.65, group_width=24, **kwargs) + return _regnet("regnet_y_1_6gf", params, pretrained, progress, **kwargs) + + +def regnet_y_3_2gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetY_3.2GF architecture from + `"Designing Network Design Spaces" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = RegNetParams(depth=21, w_0=80, w_a=42.63, w_m=2.66, group_width=24, **kwargs) + return _regnet("regnet_y_3_2gf", params, pretrained, progress, **kwargs) + + +def regnet_y_8gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetY_8GF architecture from + `"Designing Network Design Spaces" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = RegNetParams(depth=17, w_0=192, w_a=76.82, w_m=2.19, group_width=56, **kwargs) + return _regnet("regnet_y_8gf", params, pretrained, progress, **kwargs) + + +def regnet_y_16gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetY_16GF architecture from + `"Designing Network Design Spaces" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = RegNetParams(depth=18, w_0=200, w_a=106.23, w_m=2.48, group_width=112, **kwargs) + return _regnet("regnet_y_16gf", params, pretrained, progress, **kwargs) + + +def regnet_y_32gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetY_32GF architecture from + `"Designing Network Design Spaces" `_. 
+ + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = RegNetParams(depth=20, w_0=232, w_a=115.89, w_m=2.53, group_width=232, **kwargs) + return _regnet("regnet_y_32gf", params, pretrained, progress, **kwargs) + + +def regnet_x_400mf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetX_400MF architecture from + `"Designing Network Design Spaces" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = RegNetParams(depth=22, w_0=24, w_a=24.48, w_m=2.54, + group_width=16, use_se=False, **kwargs) + + return _regnet("regnet_x_400mf", params, pretrained, progress, **kwargs) + + +def regnet_x_800mf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetX_800MF architecture from + `"Designing Network Design Spaces" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = RegNetParams(depth=16, w_0=56, w_a=35.73, w_m=2.28, group_width=16, + use_se=False, **kwargs) + return _regnet("regnet_x_800mf", params, pretrained, progress, **kwargs) + + +def regnet_x_1_6gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetX_1.6GF architecture from + `"Designing Network Design Spaces" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = RegNetParams(depth=18, w_0=80, w_a=34.01, w_m=2.25, group_width=24, + use_se=False, **kwargs) + return _regnet("regnet_x_1_6gf", params, pretrained, progress, **kwargs) + + +def regnet_x_3_2gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetX_3.2GF architecture from + `"Designing Network Design Spaces" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = RegNetParams(depth=25, w_0=88, w_a=26.31, w_m=2.25, group_width=48, + use_se=False, **kwargs) + return _regnet("regnet_x_3_2gf", params, pretrained, progress, **kwargs) + + +def regnet_x_8gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetX_8GF architecture from + `"Designing Network Design Spaces" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = RegNetParams(depth=23, w_0=80, w_a=49.56, w_m=2.88, group_width=120, + use_se=False, **kwargs) + return _regnet("regnet_x_8gf", params, pretrained, progress, **kwargs) + + +def regnet_x_16gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetX_16GF architecture from + `"Designing Network Design Spaces" `_. 
+ + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = RegNetParams(depth=22, w_0=216, w_a=55.59, w_m=2.1, group_width=128, + use_se=False, **kwargs) + return _regnet("regnet_x_16gf", params, pretrained, progress, **kwargs) + + +def regnet_x_32gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetX_32GF architecture from + `"Designing Network Design Spaces" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = RegNetParams(depth=23, w_0=320, w_a=69.86, w_m=2.0, group_width=168, + use_se=False, **kwargs) + return _regnet("regnet_x_32gf", params, pretrained, progress, **kwargs) + +# TODO(kazhang): Add RegNetZ_500MF and RegNetZ_4GF From 2dbcd6d7dfbae4056ed79d3ffe35ed80d3a09fea Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Thu, 16 Sep 2021 23:25:42 +0000 Subject: [PATCH 26/40] remove unused depth --- torchvision/models/regnet.py | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py index 98c60296015..d405945267e 100644 --- a/torchvision/models/regnet.py +++ b/torchvision/models/regnet.py @@ -8,7 +8,6 @@ import torch from collections import OrderedDict -from enum import Enum, auto from typing import Any, Callable, List, Optional, Tuple from torch import nn, Tensor @@ -83,7 +82,6 @@ def __init__( ) self.final_bn = nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum) - self.depth = 2 class ResStemCifar(nn.Sequential): @@ -103,7 +101,6 @@ def __init__( nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum), activation, ) - self.depth = 2 class ResStemIN(nn.Sequential): @@ -124,7 +121,6 @@ def __init__( activation, nn.MaxPool2d(3, stride=2, padding=1), ) - self.depth = 3 class SimpleStemIN(nn.Sequential): @@ -144,7 +140,6 @@ def __init__( nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum), activation, ) - self.depth = 2 class VanillaBlock(nn.Sequential): @@ -174,8 +169,6 @@ def __init__( activation, ) - self.depth = 2 - class ResBasicBlock(nn.Module): """Residual basic block: x + F(x), F = basic transform.""" @@ -203,10 +196,6 @@ def __init__( ) self.activation = activation - # The projection and transform happen in parallel, - # and ReLU is not counted with respect to depth - self.depth = self.f.depth - def forward(self, x: Tensor) -> Tensor: if self.proj_block: x = self.bn(self.proj(x)) + self.f(x) @@ -260,7 +249,6 @@ def __init__( self.c = nn.Conv2d(w_b, width_out, 1, stride=1, padding=0, bias=False) self.final_bn = nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum) - self.depth = 3 if not se_ratio else 4 class ResBottleneckBlock(nn.Module): @@ -302,7 +290,6 @@ def __init__( # The projection and transform happen in parallel, # and activation is not counted with respect to depth - self.depth = self.f.depth def forward(self, x: Tensor) -> Tensor: if self.proj_block: @@ -341,8 +328,6 @@ def __init__( se_ratio, ) - self.depth = self.f.depth - def forward(self, x: Tensor) -> Tensor: return x + self.f(x) if self.has_skip else self.f(x) @@ -366,7 +351,6 @@ def __init__( stage_index: int = 0, ) -> None: super().__init__() - self.stage_depth = 0 for i in range(depth): block = block_constructor( @@ -381,7 +365,6 @@ def __init__( se_ratio, ) - self.stage_depth += block.depth 
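# A minimal usage sketch (assuming the module is importable as
# torchvision.models.regnet at this point in the series): the per-stage
# settings produced by get_expanded_params() can be inspected directly.
# The parameter values below are the regnet_y_400mf settings used by the
# builders in this file.
from torchvision.models.regnet import RegNetParams

params = RegNetParams(depth=16, w_0=48, w_a=27.89, w_m=2.09, group_width=8)
for width_out, stride, depth, group_width, bottleneck_multiplier in params.get_expanded_params():
    # One tuple per stage: output width, stride, number of blocks,
    # group width and bottleneck multiplier.
    print(width_out, stride, depth, group_width, bottleneck_multiplier)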
self.add_module(f"block{stage_index}-{i}", block) @@ -516,8 +499,6 @@ def __init__(self, params: RegNetParams) -> None: current_width = params.stem_width - self.trunk_depth = 0 - blocks = [] for i, ( width_out, @@ -546,8 +527,6 @@ def __init__(self, params: RegNetParams) -> None: ) ) - self.trunk_depth += blocks[-1][1].stage_depth - current_width = width_out self.trunk_output = nn.Sequential(OrderedDict(blocks)) @@ -695,7 +674,6 @@ def regnet_x_400mf(pretrained: bool = False, progress: bool = True, **kwargs: An """ params = RegNetParams(depth=22, w_0=24, w_a=24.48, w_m=2.54, group_width=16, use_se=False, **kwargs) - return _regnet("regnet_x_400mf", params, pretrained, progress, **kwargs) From baca24fee5fd3a9d2308bccbbb37ae89d5b9eba5 Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Fri, 17 Sep 2021 07:10:50 +0000 Subject: [PATCH 27/40] use BN/activation constructor and ConvBNActivation --- torchvision/models/regnet.py | 187 +++++++++++++---------------------- 1 file changed, 66 insertions(+), 121 deletions(-) diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py index d405945267e..9c3330ec726 100644 --- a/torchvision/models/regnet.py +++ b/torchvision/models/regnet.py @@ -8,11 +8,12 @@ import torch from collections import OrderedDict +from functools import partial from typing import Any, Callable, List, Optional, Tuple from torch import nn, Tensor from .._internally_replaced_utils import load_state_dict_from_url -from torchvision.models.mobilenetv2 import _make_divisible +from torchvision.models.mobilenetv2 import ConvBNActivation, _make_divisible model_urls = { # TODO(kazhang): add pretrained weights @@ -68,39 +69,17 @@ def __init__( width_in: int, width_out: int, stride: int, - bn_epsilon: float, - bn_momentum: float, - activation: nn.Module, + norm_layer: Callable[..., nn.Module], + activation_layer: Callable[..., nn.Module], ) -> None: - super().__init__() - - self.a = nn.Sequential( - nn.Conv2d(width_in, width_out, 3, stride=stride, padding=1, bias=False), - nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum), - activation, - nn.Conv2d(width_out, width_out, 3, stride=1, padding=1, bias=False), - ) - - self.final_bn = nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum) - - -class ResStemCifar(nn.Sequential): - """ResNet stem for CIFAR: 3x3, BN, ReLU.""" - - def __init__( - self, - width_in: int, - width_out: int, - bn_epsilon: float, - bn_momentum: float, - activation: nn.Module, - ) -> None: - super().__init__() - self.stem = nn.Sequential( - nn.Conv2d(width_in, width_out, 3, stride=1, padding=1, bias=False), - nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum), - activation, - ) + super().__init__(OrderedDict( + a=nn.Sequential( + ConvBNActivation(width_in, width_out, kernel_size=3, stride=stride, + norm_layer=norm_layer, activation_layer=activation_layer), + nn.Conv2d(width_out, width_out, 3, stride=1, padding=1, bias=False), + ), + final_bn=norm_layer(width_out), + )) class ResStemIN(nn.Sequential): @@ -110,36 +89,28 @@ def __init__( self, width_in: int, width_out: int, - bn_epsilon: float, - bn_momentum: float, - activation: nn.Module, + norm_layer: Callable[..., nn.Module], + activation_layer: Callable[..., nn.Module], ) -> None: - super().__init__() - self.stem = nn.Sequential( - nn.Conv2d(width_in, width_out, 7, stride=2, padding=3, bias=False), - nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum), - activation, + super().__init__( + ConvBNActivation(width_in, width_out, kernel_size=7, stride=2, + 
norm_layer=norm_layer, activation_layer=activation_layer), nn.MaxPool2d(3, stride=2, padding=1), ) -class SimpleStemIN(nn.Sequential): +class SimpleStemIN(ConvBNActivation): """Simple stem for ImageNet: 3x3, BN, ReLU.""" def __init__( self, width_in: int, width_out: int, - bn_epsilon: float, - bn_momentum: float, - activation: nn.Module, + norm_layer: Callable[..., nn.Module], + activation_layer: Callable[..., nn.Module], ) -> None: - super().__init__() - self.stem = nn.Sequential( - nn.Conv2d(width_in, width_out, 3, stride=2, padding=1, bias=False), - nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum), - activation, - ) + super().__init__(width_in, width_out, kernel_size=3, stride=2, + norm_layer=norm_layer, activation_layer=activation_layer) class VanillaBlock(nn.Sequential): @@ -150,24 +121,17 @@ def __init__( width_in: int, width_out: int, stride: int, - bn_epsilon: float, - bn_momentum: float, - activation: nn.Module, + norm_layer: Callable[..., nn.Module], + activation_layer: Callable[..., nn.Module], *args, **kwargs, ) -> None: - super().__init__() - self.a = nn.Sequential( - nn.Conv2d(width_in, width_out, 3, stride=stride, padding=1, bias=False), - nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum), - activation, - ) - - self.b = nn.Sequential( - nn.Conv2d(width_out, width_out, 3, stride=1, padding=1, bias=False), - nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum), - activation, - ) + super().__init__(OrderedDict( + a=ConvBNActivation(width_in, width_out, kernel_size=3, stride=stride, + norm_layer=norm_layer, activation_layer=activation_layer), + b=ConvBNActivation(width_out, width_out, kernel_size=3, stride=1, + norm_layer=norm_layer, activation_layer=activation_layer), + )) class ResBasicBlock(nn.Module): @@ -178,9 +142,8 @@ def __init__( width_in: int, width_out: int, stride: int, - bn_epsilon: float, - bn_momentum: float, - activation: nn.Module, + norm_layer: Callable[..., nn.Module], + activation_layer: Callable[..., nn.Module], *args, **kwargs, ) -> None: @@ -190,11 +153,11 @@ def __init__( self.proj = nn.Conv2d( width_in, width_out, 1, stride=stride, padding=0, bias=False ) - self.bn = nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum) + self.bn = norm_layer(width_out) self.f = BasicTransform( - width_in, width_out, stride, bn_epsilon, bn_momentum, activation + width_in, width_out, stride, norm_layer, activation_layer ) - self.activation = activation + self.activation = activation_layer(inplace=True) def forward(self, x: Tensor) -> Tensor: if self.proj_block: @@ -213,42 +176,35 @@ def __init__( width_in: int, width_out: int, stride: int, - bn_epsilon: float, - bn_momentum: float, - activation: nn.Module, + norm_layer: Callable[..., nn.Module], + activation_layer: Callable[..., nn.Module], group_width: int, bottleneck_multiplier: float, se_ratio: Optional[float], ) -> None: - super().__init__() + layers = OrderedDict() w_b = int(round(width_out * bottleneck_multiplier)) g = w_b // group_width - self.a = nn.Sequential( - nn.Conv2d(width_in, w_b, 1, stride=1, padding=0, bias=False), - nn.BatchNorm2d(w_b, eps=bn_epsilon, momentum=bn_momentum), - activation, - ) - - self.b = nn.Sequential( - nn.Conv2d(w_b, w_b, 3, stride=stride, padding=1, groups=g, bias=False), - nn.BatchNorm2d(w_b, eps=bn_epsilon, momentum=bn_momentum), - activation, - ) + layers["a"] = ConvBNActivation(width_in, w_b, kernel_size=1, stride=1, + norm_layer=norm_layer, activation_layer=activation_layer) + layers["b"] = ConvBNActivation(w_b, w_b, 
kernel_size=3, stride=stride, groups=g, + norm_layer=norm_layer, activation_layer=activation_layer) if se_ratio: # The SE reduction ratio is defined with respect to the # beginning of the block width_se_out = int(round(se_ratio * width_in)) - self.se = _SqueezeExcitation( + layers["se"] = _SqueezeExcitation( in_channels=w_b, reduction_ratio=None, reduced_channels=width_se_out, - activation=activation, + activation=activation_layer(inplace=True), ) - self.c = nn.Conv2d(w_b, width_out, 1, stride=1, padding=0, bias=False) - self.final_bn = nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum) + layers["c"] = nn.Conv2d(w_b, width_out, 1, stride=1, padding=0, bias=False) + layers["final_bn"] = norm_layer(width_out) + super().__init__(layers) class ResBottleneckBlock(nn.Module): @@ -259,9 +215,8 @@ def __init__( width_in: int, width_out: int, stride: int, - bn_epsilon: float, - bn_momentum: float, - activation: nn.Module, + norm_layer: Callable[..., nn.Module], + activation_layer: Callable[..., nn.Module], group_width: int = 1, bottleneck_multiplier: float = 1.0, se_ratio: Optional[float] = None, @@ -274,19 +229,18 @@ def __init__( self.proj = nn.Conv2d( width_in, width_out, 1, stride=stride, padding=0, bias=False ) - self.bn = nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum) + self.bn = norm_layer(width_out) self.f = BottleneckTransform( width_in, width_out, stride, - bn_epsilon, - bn_momentum, - activation, + norm_layer, + activation_layer, group_width, bottleneck_multiplier, se_ratio, ) - self.activation = activation + self.activation = activation_layer(inplace=True) # The projection and transform happen in parallel, # and activation is not counted with respect to depth @@ -307,9 +261,8 @@ def __init__( width_in: int, width_out: int, stride: int, - bn_epsilon: float, - bn_momentum: float, - activation: nn.Module, + norm_layer: Callable[..., nn.Module], + activation_layer: Callable[..., nn.Module], group_width: int = 1, bottleneck_multiplier: float = 4.0, se_ratio: Optional[float] = None, @@ -320,9 +273,8 @@ def __init__( width_in, width_out, stride, - bn_epsilon, - bn_momentum, - activation, + norm_layer, + activation_layer, group_width, bottleneck_multiplier, se_ratio, @@ -342,9 +294,8 @@ def __init__( stride: int, depth: int, block_constructor: Callable[..., nn.Module], - bn_epsilon: float, - bn_momentum: float, - activation: nn.Module, + norm_layer: Callable[..., nn.Module], + activation_layer: Callable[..., nn.Module], group_width: int, bottleneck_multiplier: float, se_ratio: Optional[float] = None, @@ -357,9 +308,8 @@ def __init__( width_in if i == 0 else width_out, width_out, stride if i == 0 else 1, - bn_epsilon, - bn_momentum, - activation, + norm_layer, + activation_layer, group_width, bottleneck_multiplier, se_ratio, @@ -398,11 +348,10 @@ def __init__( self.stem_type = stem_type self.block_type = block_type self.activation = activation + self.norm_layer = partial(nn.BatchNorm2d, eps=bn_epsilon, momentum=bn_momentum) self.stem_width = stem_width self.use_se = use_se self.se_ratio = se_ratio if use_se else None - self.bn_epsilon = bn_epsilon - self.bn_momentum = bn_momentum self.num_classes = num_classes def get_expanded_params(self): @@ -486,15 +435,12 @@ class RegNet(nn.Module): def __init__(self, params: RegNetParams) -> None: super().__init__() - activation = params.activation(inplace=True) - # Ad hoc stem self.stem = params.stem_type( 3, # width_in params.stem_width, - params.bn_epsilon, - params.bn_momentum, - activation, + params.norm_layer, + 
params.activation, ) current_width = params.stem_width @@ -516,9 +462,8 @@ def __init__(self, params: RegNetParams) -> None: stride, depth, params.block_type, - params.bn_epsilon, - params.bn_momentum, - activation, + params.norm_layer, + params.activation, group_width, bottleneck_multiplier, params.se_ratio, From 233bdff2c1c256cec38d2b1a4c7cf747c5694051 Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Mon, 20 Sep 2021 00:02:08 +0000 Subject: [PATCH 28/40] reuse SqueezeExcitation from efficientnet --- torchvision/models/regnet.py | 50 ++++-------------------------------- 1 file changed, 5 insertions(+), 45 deletions(-) diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py index 9c3330ec726..32b4bc9d4b2 100644 --- a/torchvision/models/regnet.py +++ b/torchvision/models/regnet.py @@ -14,6 +14,7 @@ from .._internally_replaced_utils import load_state_dict_from_url from torchvision.models.mobilenetv2 import ConvBNActivation, _make_divisible +from torchvision.models.efficientnet import SqueezeExcitation model_urls = { # TODO(kazhang): add pretrained weights @@ -21,46 +22,6 @@ } -class _SqueezeExcitation(nn.Module): - """ - Squeeze and excitation layer from - `"Squeeze-and-Excitation Networks" `_. - """ - - def __init__( - self, - in_channels: int, - reduction_ratio: Optional[int] = 16, - reduced_channels: Optional[int] = None, - activation: Optional[nn.Module] = None, - ) -> None: - super().__init__() - self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) - - # Either reduction_ratio is defined, or out_channels is defined, - # neither both nor none of them - assert bool(reduction_ratio) != bool(reduced_channels) - - if activation is None: - activation = nn.ReLU() - - reduced_channels = ( - in_channels // reduction_ratio if reduced_channels is None else reduced_channels - ) - self.excitation = nn.Sequential( - nn.Conv2d(in_channels, reduced_channels, kernel_size=1, stride=1, bias=True), - activation, - nn.Conv2d(reduced_channels, in_channels, kernel_size=1, stride=1, bias=True), - nn.Sigmoid(), - ) - - def forward(self, x: Tensor) -> Tensor: - x_squeezed = self.avgpool(x) - x_excited = self.excitation(x_squeezed) - x_scaled = x * x_excited - return x_scaled - - class BasicTransform(nn.Sequential): """Basic transformation: [3x3 conv, BN, Relu] x2.""" @@ -195,11 +156,10 @@ def __init__( # The SE reduction ratio is defined with respect to the # beginning of the block width_se_out = int(round(se_ratio * width_in)) - layers["se"] = _SqueezeExcitation( - in_channels=w_b, - reduction_ratio=None, - reduced_channels=width_se_out, - activation=activation_layer(inplace=True), + layers["se"] = SqueezeExcitation( + input_channels=w_b, + squeeze_channels=width_se_out, + activation=activation_layer, ) layers["c"] = nn.Conv2d(w_b, width_out, 1, stride=1, padding=0, bias=False) From 0968d279ba072136a4d28b6c8ef2a7951cec847d Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Mon, 20 Sep 2021 21:04:35 +0000 Subject: [PATCH 29/40] refactor RegNetParams into BlockParams --- torchvision/models/regnet.py | 103 ++++++++++++++++++----------------- 1 file changed, 54 insertions(+), 49 deletions(-) diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py index 32b4bc9d4b2..7e94bf04978 100644 --- a/torchvision/models/regnet.py +++ b/torchvision/models/regnet.py @@ -278,7 +278,7 @@ def __init__( self.add_module(f"block{stage_index}-{i}", block) -class RegNetParams: +class BlockParams: def __init__( self, depth: int, @@ -287,15 +287,8 @@ def __init__( w_m: float, group_width: int, bottleneck_multiplier: 
float = 1.0, - stem_type: Callable[..., nn.Module] = SimpleStemIN, - stem_width: int = 32, - block_type: Callable[..., nn.Module] = ResBottleneckBlock, - activation: Callable[..., nn.Module] = nn.ReLU, use_se: bool = True, se_ratio: float = 0.25, - bn_epsilon: float = 1e-05, - bn_momentum: float = 0.1, - num_classes: int = 1000, ) -> None: if w_a < 0 or w_0 <= 0 or w_m <= 1 or w_0 % 8 != 0: raise ValueError("Invalid RegNet settings") @@ -305,14 +298,8 @@ def __init__( self.w_m = w_m self.group_width = group_width self.bottleneck_multiplier = bottleneck_multiplier - self.stem_type = stem_type - self.block_type = block_type - self.activation = activation - self.norm_layer = partial(nn.BatchNorm2d, eps=bn_epsilon, momentum=bn_momentum) - self.stem_width = stem_width self.use_se = use_se self.se_ratio = se_ratio if use_se else None - self.num_classes = num_classes def get_expanded_params(self): """ @@ -392,18 +379,36 @@ def _adjust_widths_groups_compatibilty( class RegNet(nn.Module): - def __init__(self, params: RegNetParams) -> None: + def __init__( + self, + block_params: BlockParams, + num_classes: int = 1000, + stem_width: int = 32, + stem_type: Optional[Callable[..., nn.Module]] = None, + block_type: Optional[Callable[..., nn.Module]] = None, + norm_layer: Optional[Callable[..., nn.Module]] = None, + activation: Optional[Callable[..., nn.Module]] = None, + ) -> None: super().__init__() + if stem_type is None: + stem_type = SimpleStemIN + if norm_layer is None: + norm_layer = nn.BatchNorm2d + if block_type is None: + block_type = ResBottleneckBlock + if activation is None: + activation = nn.ReLU + # Ad hoc stem - self.stem = params.stem_type( + self.stem = stem_type( 3, # width_in - params.stem_width, - params.norm_layer, - params.activation, + stem_width, + norm_layer, + activation, ) - current_width = params.stem_width + current_width = stem_width blocks = [] for i, ( @@ -412,7 +417,7 @@ def __init__(self, params: RegNetParams) -> None: depth, group_width, bottleneck_multiplier, - ) in enumerate(params.get_expanded_params()): + ) in enumerate(block_params.get_expanded_params()): blocks.append( ( f"block{i+1}", @@ -421,12 +426,12 @@ def __init__(self, params: RegNetParams) -> None: width_out, stride, depth, - params.block_type, - params.norm_layer, - params.activation, + block_type, + norm_layer, + activation, group_width, bottleneck_multiplier, - params.se_ratio, + block_params.se_ratio, stage_index=i + 1, ), ) @@ -437,7 +442,7 @@ def __init__(self, params: RegNetParams) -> None: self.trunk_output = nn.Sequential(OrderedDict(blocks)) self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) - self.fc = nn.Linear(in_features=current_width, out_features=params.num_classes) + self.fc = nn.Linear(in_features=current_width, out_features=num_classes) # Init weights and good to go self._init_weights() @@ -467,8 +472,8 @@ def _init_weights(self) -> None: m.bias.data.zero_() -def _regnet(arch: str, params: RegNetParams, pretrained: bool, progress: bool, **kwargs: Any) -> RegNet: - model = RegNet(params) +def _regnet(arch: str, block_params: BlockParams, pretrained: bool, progress: bool, **kwargs: Any) -> RegNet: + model = RegNet(block_params, norm_layer=partial(nn.BatchNorm2d, eps=1e-05, momentum=0.1), **kwargs) if pretrained: if arch not in model_urls: raise ValueError(f"No checkpoint is available for model type {arch}") @@ -486,7 +491,7 @@ def regnet_y_400mf(pretrained: bool = False, progress: bool = True, **kwargs: An pretrained (bool): If True, returns a model pre-trained on ImageNet progress (bool): If 
True, displays a progress bar of the download to stderr """ - params = RegNetParams(depth=16, w_0=48, w_a=27.89, w_m=2.09, group_width=8, **kwargs) + params = BlockParams(depth=16, w_0=48, w_a=27.89, w_m=2.09, group_width=8, **kwargs) return _regnet("regnet_y_400mf", params, pretrained, progress, **kwargs) @@ -499,7 +504,7 @@ def regnet_y_800mf(pretrained: bool = False, progress: bool = True, **kwargs: An pretrained (bool): If True, returns a model pre-trained on ImageNet progress (bool): If True, displays a progress bar of the download to stderr """ - params = RegNetParams(depth=14, w_0=56, w_a=38.84, w_m=2.4, group_width=16, **kwargs) + params = BlockParams(depth=14, w_0=56, w_a=38.84, w_m=2.4, group_width=16, **kwargs) return _regnet("regnet_y_800mf", params, pretrained, progress, **kwargs) @@ -512,7 +517,7 @@ def regnet_y_1_6gf(pretrained: bool = False, progress: bool = True, **kwargs: An pretrained (bool): If True, returns a model pre-trained on ImageNet progress (bool): If True, displays a progress bar of the download to stderr """ - params = RegNetParams(depth=27, w_0=48, w_a=20.71, w_m=2.65, group_width=24, **kwargs) + params = BlockParams(depth=27, w_0=48, w_a=20.71, w_m=2.65, group_width=24, **kwargs) return _regnet("regnet_y_1_6gf", params, pretrained, progress, **kwargs) @@ -525,7 +530,7 @@ def regnet_y_3_2gf(pretrained: bool = False, progress: bool = True, **kwargs: An pretrained (bool): If True, returns a model pre-trained on ImageNet progress (bool): If True, displays a progress bar of the download to stderr """ - params = RegNetParams(depth=21, w_0=80, w_a=42.63, w_m=2.66, group_width=24, **kwargs) + params = BlockParams(depth=21, w_0=80, w_a=42.63, w_m=2.66, group_width=24, **kwargs) return _regnet("regnet_y_3_2gf", params, pretrained, progress, **kwargs) @@ -538,7 +543,7 @@ def regnet_y_8gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) pretrained (bool): If True, returns a model pre-trained on ImageNet progress (bool): If True, displays a progress bar of the download to stderr """ - params = RegNetParams(depth=17, w_0=192, w_a=76.82, w_m=2.19, group_width=56, **kwargs) + params = BlockParams(depth=17, w_0=192, w_a=76.82, w_m=2.19, group_width=56, **kwargs) return _regnet("regnet_y_8gf", params, pretrained, progress, **kwargs) @@ -551,7 +556,7 @@ def regnet_y_16gf(pretrained: bool = False, progress: bool = True, **kwargs: Any pretrained (bool): If True, returns a model pre-trained on ImageNet progress (bool): If True, displays a progress bar of the download to stderr """ - params = RegNetParams(depth=18, w_0=200, w_a=106.23, w_m=2.48, group_width=112, **kwargs) + params = BlockParams(depth=18, w_0=200, w_a=106.23, w_m=2.48, group_width=112, **kwargs) return _regnet("regnet_y_16gf", params, pretrained, progress, **kwargs) @@ -564,7 +569,7 @@ def regnet_y_32gf(pretrained: bool = False, progress: bool = True, **kwargs: Any pretrained (bool): If True, returns a model pre-trained on ImageNet progress (bool): If True, displays a progress bar of the download to stderr """ - params = RegNetParams(depth=20, w_0=232, w_a=115.89, w_m=2.53, group_width=232, **kwargs) + params = BlockParams(depth=20, w_0=232, w_a=115.89, w_m=2.53, group_width=232, **kwargs) return _regnet("regnet_y_32gf", params, pretrained, progress, **kwargs) @@ -577,8 +582,8 @@ def regnet_x_400mf(pretrained: bool = False, progress: bool = True, **kwargs: An pretrained (bool): If True, returns a model pre-trained on ImageNet progress (bool): If True, displays a progress bar of the download to stderr 
""" - params = RegNetParams(depth=22, w_0=24, w_a=24.48, w_m=2.54, - group_width=16, use_se=False, **kwargs) + params = BlockParams(depth=22, w_0=24, w_a=24.48, w_m=2.54, + group_width=16, use_se=False, **kwargs) return _regnet("regnet_x_400mf", params, pretrained, progress, **kwargs) @@ -591,8 +596,8 @@ def regnet_x_800mf(pretrained: bool = False, progress: bool = True, **kwargs: An pretrained (bool): If True, returns a model pre-trained on ImageNet progress (bool): If True, displays a progress bar of the download to stderr """ - params = RegNetParams(depth=16, w_0=56, w_a=35.73, w_m=2.28, group_width=16, - use_se=False, **kwargs) + params = BlockParams(depth=16, w_0=56, w_a=35.73, w_m=2.28, group_width=16, + use_se=False, **kwargs) return _regnet("regnet_x_800mf", params, pretrained, progress, **kwargs) @@ -605,8 +610,8 @@ def regnet_x_1_6gf(pretrained: bool = False, progress: bool = True, **kwargs: An pretrained (bool): If True, returns a model pre-trained on ImageNet progress (bool): If True, displays a progress bar of the download to stderr """ - params = RegNetParams(depth=18, w_0=80, w_a=34.01, w_m=2.25, group_width=24, - use_se=False, **kwargs) + params = BlockParams(depth=18, w_0=80, w_a=34.01, w_m=2.25, group_width=24, + use_se=False, **kwargs) return _regnet("regnet_x_1_6gf", params, pretrained, progress, **kwargs) @@ -619,8 +624,8 @@ def regnet_x_3_2gf(pretrained: bool = False, progress: bool = True, **kwargs: An pretrained (bool): If True, returns a model pre-trained on ImageNet progress (bool): If True, displays a progress bar of the download to stderr """ - params = RegNetParams(depth=25, w_0=88, w_a=26.31, w_m=2.25, group_width=48, - use_se=False, **kwargs) + params = BlockParams(depth=25, w_0=88, w_a=26.31, w_m=2.25, group_width=48, + use_se=False, **kwargs) return _regnet("regnet_x_3_2gf", params, pretrained, progress, **kwargs) @@ -633,8 +638,8 @@ def regnet_x_8gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) pretrained (bool): If True, returns a model pre-trained on ImageNet progress (bool): If True, displays a progress bar of the download to stderr """ - params = RegNetParams(depth=23, w_0=80, w_a=49.56, w_m=2.88, group_width=120, - use_se=False, **kwargs) + params = BlockParams(depth=23, w_0=80, w_a=49.56, w_m=2.88, group_width=120, + use_se=False, **kwargs) return _regnet("regnet_x_8gf", params, pretrained, progress, **kwargs) @@ -647,8 +652,8 @@ def regnet_x_16gf(pretrained: bool = False, progress: bool = True, **kwargs: Any pretrained (bool): If True, returns a model pre-trained on ImageNet progress (bool): If True, displays a progress bar of the download to stderr """ - params = RegNetParams(depth=22, w_0=216, w_a=55.59, w_m=2.1, group_width=128, - use_se=False, **kwargs) + params = BlockParams(depth=22, w_0=216, w_a=55.59, w_m=2.1, group_width=128, + use_se=False, **kwargs) return _regnet("regnet_x_16gf", params, pretrained, progress, **kwargs) @@ -661,8 +666,8 @@ def regnet_x_32gf(pretrained: bool = False, progress: bool = True, **kwargs: Any pretrained (bool): If True, returns a model pre-trained on ImageNet progress (bool): If True, displays a progress bar of the download to stderr """ - params = RegNetParams(depth=23, w_0=320, w_a=69.86, w_m=2.0, group_width=168, - use_se=False, **kwargs) + params = BlockParams(depth=23, w_0=320, w_a=69.86, w_m=2.0, group_width=168, + use_se=False, **kwargs) return _regnet("regnet_x_32gf", params, pretrained, progress, **kwargs) # TODO(kazhang): Add RegNetZ_500MF and RegNetZ_4GF From 
2417685c8e7154e2d27c9286a7bc6d400bf4e34c Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Mon, 20 Sep 2021 22:15:03 +0000 Subject: [PATCH 30/40] use nn.init, replace np with torch --- torchvision/models/regnet.py | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py index 7e94bf04978..0b49bc17fb8 100644 --- a/torchvision/models/regnet.py +++ b/torchvision/models/regnet.py @@ -3,7 +3,6 @@ # https://github.com/facebookresearch/ClassyVision/blob/main/classy_vision/models/regnet.py -import numpy as np import math import torch @@ -326,14 +325,14 @@ def get_expanded_params(self): STRIDE = 2 # Compute the block widths. Each stage has one unique block width - widths_cont = np.arange(self.depth) * self.w_a + self.w_0 - block_capacity = np.round(np.log(widths_cont / self.w_0) / np.log(self.w_m)) + widths_cont = torch.arange(self.depth) * self.w_a + self.w_0 + block_capacity = torch.round(torch.log(widths_cont / self.w_0) / math.log(self.w_m)) block_widths = ( - np.round(np.divide(self.w_0 * np.power(self.w_m, block_capacity), QUANT)) + torch.round(torch.divide(self.w_0 * torch.pow(self.w_m, block_capacity), QUANT)) * QUANT - ) - num_stages = len(np.unique(block_widths)) - block_widths = block_widths.astype(int).tolist() + ).int() + num_stages = len(torch.unique(block_widths)) + block_widths = block_widths.tolist() # Convert to per stage parameters split_helper = zip( @@ -345,7 +344,7 @@ def get_expanded_params(self): splits = [w != wp or r != rp for w, wp, r, rp in split_helper] stage_widths = [w for w, t in zip(block_widths, splits[:-1]) if t] - stage_depths = np.diff([d for d, t in enumerate(splits) if t]).tolist() + stage_depths = torch.diff(torch.Tensor([d for d, t in enumerate(splits) if t])).int().tolist() strides = [STRIDE] * num_stages bottleneck_multipliers = [self.bottleneck_multiplier] * num_stages @@ -445,7 +444,7 @@ def __init__( self.fc = nn.Linear(in_features=current_width, out_features=num_classes) # Init weights and good to go - self._init_weights() + self.reset_parameters() def forward(self, x: Tensor) -> Tensor: x = self.stem(x) @@ -457,19 +456,19 @@ def forward(self, x: Tensor) -> Tensor: return x - def _init_weights(self) -> None: + def reset_parameters(self) -> None: # Performs ResNet-style weight initialization for m in self.modules(): if isinstance(m, nn.Conv2d): # Note that there is no bias due to BN fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels - m.weight.data.normal_(mean=0.0, std=math.sqrt(2.0 / fan_out)) + nn.init.normal_(m.weight, mean=0.0, std=math.sqrt(2.0 / fan_out)) elif isinstance(m, nn.BatchNorm2d): - m.weight.data.fill_(1.0) - m.bias.data.zero_() + nn.init.ones_(m.weight) + nn.init.zeros_(m.bias) elif isinstance(m, nn.Linear): - m.weight.data.normal_(mean=0.0, std=0.01) - m.bias.data.zero_() + nn.init.normal_(m.weight, mean=0.0, std=0.01) + nn.init.zeros_(m.bias) def _regnet(arch: str, block_params: BlockParams, pretrained: bool, progress: bool, **kwargs: Any) -> RegNet: From f3b3e9667b073a0071be9f76a655ab49ba18cb9e Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Mon, 20 Sep 2021 22:40:50 +0000 Subject: [PATCH 31/40] update README --- references/classification/README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/references/classification/README.md b/references/classification/README.md index 79149758428..5d945d2728d 100644 --- a/references/classification/README.md +++ b/references/classification/README.md @@ 
-82,8 +82,7 @@ The weights of the B5-B7 variants are ported from Luke Melas' [EfficientNet-PyTo ### RegNet ``` -python -m torch.distributed.launch --nproc_per_node=8 --use_env train.py\ - --model regnet_y_400mf --epochs 100 +torchrun --nproc_per_node=8 train.py --model regnet_y_400mf --epochs 100 --batch-size 128 ``` ## Mixed precision training From e60e4daad310fb52bdae176327c9b9ebe4c2eb10 Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Tue, 21 Sep 2021 05:59:32 +0000 Subject: [PATCH 32/40] construct model with stem, block, classifier instances --- torchvision/models/regnet.py | 154 +++++++++++++++++++---------------- 1 file changed, 85 insertions(+), 69 deletions(-) diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py index 0b49bc17fb8..9e2a574356e 100644 --- a/torchvision/models/regnet.py +++ b/torchvision/models/regnet.py @@ -73,6 +73,20 @@ def __init__( norm_layer=norm_layer, activation_layer=activation_layer) +def _make_stem( + stem_width: int, + norm_layer: Callable[..., nn.Module], + activation: Callable[..., nn.Module], + stem_type: Callable[..., nn.Module] = SimpleStemIN, +) -> nn.Module: + return stem_type( + 3, # width_in + stem_width, + norm_layer, + activation, + ) + + class VanillaBlock(nn.Sequential): """Vanilla block: [3x3 conv, BN, Relu] x2.""" @@ -201,9 +215,6 @@ def __init__( ) self.activation = activation_layer(inplace=True) - # The projection and transform happen in parallel, - # and activation is not counted with respect to depth - def forward(self, x: Tensor) -> Tensor: if self.proj_block: x = self.bn(self.proj(x)) + self.f(x) @@ -288,6 +299,7 @@ def __init__( bottleneck_multiplier: float = 1.0, use_se: bool = True, se_ratio: float = 0.25, + **kwargs: Any, ) -> None: if w_a < 0 or w_0 <= 0 or w_m <= 1 or w_0 % 8 != 0: raise ValueError("Invalid RegNet settings") @@ -377,83 +389,79 @@ def _adjust_widths_groups_compatibilty( return stage_widths, group_widths_min -class RegNet(nn.Module): - def __init__( - self, - block_params: BlockParams, - num_classes: int = 1000, - stem_width: int = 32, - stem_type: Optional[Callable[..., nn.Module]] = None, - block_type: Optional[Callable[..., nn.Module]] = None, - norm_layer: Optional[Callable[..., nn.Module]] = None, - activation: Optional[Callable[..., nn.Module]] = None, - ) -> None: - super().__init__() - - if stem_type is None: - stem_type = SimpleStemIN - if norm_layer is None: - norm_layer = nn.BatchNorm2d - if block_type is None: - block_type = ResBottleneckBlock - if activation is None: - activation = nn.ReLU - - # Ad hoc stem - self.stem = stem_type( - 3, # width_in - stem_width, - norm_layer, - activation, +def _make_blocks( + stem_width: int, + params: BlockParams, + norm_layer: Callable[..., nn.Module], + activation: Callable[..., nn.Module], + block_type: Callable[..., nn.Module] = ResBottleneckBlock, +) -> Tuple[nn.Sequential, int]: + current_width = stem_width + + blocks = [] + for i, ( + width_out, + stride, + depth, + group_width, + bottleneck_multiplier, + ) in enumerate(params.get_expanded_params()): + blocks.append( + ( + f"block{i+1}", + AnyStage( + current_width, + width_out, + stride, + depth, + block_type, + norm_layer, + activation, + group_width, + bottleneck_multiplier, + params.se_ratio, + stage_index=i + 1, + ), + ) ) - current_width = stem_width + current_width = width_out + return (nn.Sequential(OrderedDict(blocks)), current_width) - blocks = [] - for i, ( - width_out, - stride, - depth, - group_width, - bottleneck_multiplier, - ) in enumerate(block_params.get_expanded_params()): - 
blocks.append( - ( - f"block{i+1}", - AnyStage( - current_width, - width_out, - stride, - depth, - block_type, - norm_layer, - activation, - group_width, - bottleneck_multiplier, - block_params.se_ratio, - stage_index=i + 1, - ), - ) - ) - current_width = width_out +class Classifier(nn.Module): + def __init__(self, in_channels: int, num_classes: int = 1000) -> None: + super().__init__() + self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) + self.fc = nn.Linear(in_features=in_channels, out_features=num_classes) - self.trunk_output = nn.Sequential(OrderedDict(blocks)) + def forward(self, x: Tensor) -> Tensor: + x = self.avgpool(x) + x = x.flatten(start_dim=1) + x = self.fc(x) + return x - self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) - self.fc = nn.Linear(in_features=current_width, out_features=num_classes) + +class RegNet(nn.Module): + def __init__( + self, + stem: nn.Module, + blocks: nn.Module, + classifier: nn.Module, + **kwargs: Any, + ) -> None: + super().__init__() + self.stem = stem + self.blocks = blocks + self.classifier = classifier # Init weights and good to go self.reset_parameters() def forward(self, x: Tensor) -> Tensor: x = self.stem(x) - x = self.trunk_output(x) - - x = self.avgpool(x) - x = x.flatten(start_dim=1) - x = self.fc(x) - + x = self.blocks(x) + x = self.classifier(x) return x def reset_parameters(self) -> None: @@ -472,7 +480,15 @@ def reset_parameters(self) -> None: def _regnet(arch: str, block_params: BlockParams, pretrained: bool, progress: bool, **kwargs: Any) -> RegNet: - model = RegNet(block_params, norm_layer=partial(nn.BatchNorm2d, eps=1e-05, momentum=0.1), **kwargs) + norm_layer = kwargs["norm_layer"] if "norm_layer" in kwargs else partial(nn.BatchNorm2d, eps=1e-05, momentum=0.1) + activation = kwargs["activation"] if "activation" in kwargs else nn.ReLU + num_classes = kwargs["num_classes"] if "num_classes" in kwargs else 1000 + + stem_width = 32 + stem = _make_stem(stem_width, norm_layer=norm_layer, activation=activation) + blocks, out_channels = _make_blocks(stem_width, params=block_params, norm_layer=norm_layer, activation=activation) + classifier = Classifier(out_channels, num_classes) + model = RegNet(stem, blocks, classifier) if pretrained: if arch not in model_urls: raise ValueError(f"No checkpoint is available for model type {arch}") From 27da2c735b6338f468204ba76d07daace1fbeba5 Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Wed, 22 Sep 2021 18:51:58 +0000 Subject: [PATCH 33/40] Revert "construct model with stem, block, classifier instances" This reverts commit 850f5f3ed01a2a9b36fcbf8405afd6e41d2e58ef. 
--- torchvision/models/regnet.py | 154 ++++++++++++++++------------------- 1 file changed, 69 insertions(+), 85 deletions(-) diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py index 9e2a574356e..0b49bc17fb8 100644 --- a/torchvision/models/regnet.py +++ b/torchvision/models/regnet.py @@ -73,20 +73,6 @@ def __init__( norm_layer=norm_layer, activation_layer=activation_layer) -def _make_stem( - stem_width: int, - norm_layer: Callable[..., nn.Module], - activation: Callable[..., nn.Module], - stem_type: Callable[..., nn.Module] = SimpleStemIN, -) -> nn.Module: - return stem_type( - 3, # width_in - stem_width, - norm_layer, - activation, - ) - - class VanillaBlock(nn.Sequential): """Vanilla block: [3x3 conv, BN, Relu] x2.""" @@ -215,6 +201,9 @@ def __init__( ) self.activation = activation_layer(inplace=True) + # The projection and transform happen in parallel, + # and activation is not counted with respect to depth + def forward(self, x: Tensor) -> Tensor: if self.proj_block: x = self.bn(self.proj(x)) + self.f(x) @@ -299,7 +288,6 @@ def __init__( bottleneck_multiplier: float = 1.0, use_se: bool = True, se_ratio: float = 0.25, - **kwargs: Any, ) -> None: if w_a < 0 or w_0 <= 0 or w_m <= 1 or w_0 % 8 != 0: raise ValueError("Invalid RegNet settings") @@ -389,79 +377,83 @@ def _adjust_widths_groups_compatibilty( return stage_widths, group_widths_min -def _make_blocks( - stem_width: int, - params: BlockParams, - norm_layer: Callable[..., nn.Module], - activation: Callable[..., nn.Module], - block_type: Callable[..., nn.Module] = ResBottleneckBlock, -) -> Tuple[nn.Sequential, int]: - current_width = stem_width - - blocks = [] - for i, ( - width_out, - stride, - depth, - group_width, - bottleneck_multiplier, - ) in enumerate(params.get_expanded_params()): - blocks.append( - ( - f"block{i+1}", - AnyStage( - current_width, - width_out, - stride, - depth, - block_type, - norm_layer, - activation, - group_width, - bottleneck_multiplier, - params.se_ratio, - stage_index=i + 1, - ), - ) - ) +class RegNet(nn.Module): + def __init__( + self, + block_params: BlockParams, + num_classes: int = 1000, + stem_width: int = 32, + stem_type: Optional[Callable[..., nn.Module]] = None, + block_type: Optional[Callable[..., nn.Module]] = None, + norm_layer: Optional[Callable[..., nn.Module]] = None, + activation: Optional[Callable[..., nn.Module]] = None, + ) -> None: + super().__init__() - current_width = width_out - return (nn.Sequential(OrderedDict(blocks)), current_width) + if stem_type is None: + stem_type = SimpleStemIN + if norm_layer is None: + norm_layer = nn.BatchNorm2d + if block_type is None: + block_type = ResBottleneckBlock + if activation is None: + activation = nn.ReLU + + # Ad hoc stem + self.stem = stem_type( + 3, # width_in + stem_width, + norm_layer, + activation, + ) + current_width = stem_width -class Classifier(nn.Module): - def __init__(self, in_channels: int, num_classes: int = 1000) -> None: - super().__init__() - self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) - self.fc = nn.Linear(in_features=in_channels, out_features=num_classes) + blocks = [] + for i, ( + width_out, + stride, + depth, + group_width, + bottleneck_multiplier, + ) in enumerate(block_params.get_expanded_params()): + blocks.append( + ( + f"block{i+1}", + AnyStage( + current_width, + width_out, + stride, + depth, + block_type, + norm_layer, + activation, + group_width, + bottleneck_multiplier, + block_params.se_ratio, + stage_index=i + 1, + ), + ) + ) - def forward(self, x: Tensor) -> Tensor: - x = self.avgpool(x) 
- x = x.flatten(start_dim=1) - x = self.fc(x) - return x + current_width = width_out + self.trunk_output = nn.Sequential(OrderedDict(blocks)) -class RegNet(nn.Module): - def __init__( - self, - stem: nn.Module, - blocks: nn.Module, - classifier: nn.Module, - **kwargs: Any, - ) -> None: - super().__init__() - self.stem = stem - self.blocks = blocks - self.classifier = classifier + self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) + self.fc = nn.Linear(in_features=current_width, out_features=num_classes) # Init weights and good to go self.reset_parameters() def forward(self, x: Tensor) -> Tensor: x = self.stem(x) - x = self.blocks(x) - x = self.classifier(x) + x = self.trunk_output(x) + + x = self.avgpool(x) + x = x.flatten(start_dim=1) + x = self.fc(x) + return x def reset_parameters(self) -> None: @@ -480,15 +472,7 @@ def reset_parameters(self) -> None: def _regnet(arch: str, block_params: BlockParams, pretrained: bool, progress: bool, **kwargs: Any) -> RegNet: - norm_layer = kwargs["norm_layer"] if "norm_layer" in kwargs else partial(nn.BatchNorm2d, eps=1e-05, momentum=0.1) - activation = kwargs["activation"] if "activation" in kwargs else nn.ReLU - num_classes = kwargs["num_classes"] if "num_classes" in kwargs else 1000 - - stem_width = 32 - stem = _make_stem(stem_width, norm_layer=norm_layer, activation=activation) - blocks, out_channels = _make_blocks(stem_width, params=block_params, norm_layer=norm_layer, activation=activation) - classifier = Classifier(out_channels, num_classes) - model = RegNet(stem, blocks, classifier) + model = RegNet(block_params, norm_layer=partial(nn.BatchNorm2d, eps=1e-05, momentum=0.1), **kwargs) if pretrained: if arch not in model_urls: raise ValueError(f"No checkpoint is available for model type {arch}") From ddf53837825fe02f72610747c0086b74760da4d0 Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Wed, 22 Sep 2021 21:34:07 +0000 Subject: [PATCH 34/40] remove unused blocks --- torchvision/models/efficientnet.py | 2 + torchvision/models/regnet.py | 128 +---------------------------- 2 files changed, 3 insertions(+), 127 deletions(-) diff --git a/torchvision/models/efficientnet.py b/torchvision/models/efficientnet.py index dbfb6bb7dd7..bad5b57b25b 100644 --- a/torchvision/models/efficientnet.py +++ b/torchvision/models/efficientnet.py @@ -37,11 +37,13 @@ def __init__( input_channels: int, squeeze_channels: int, activation: Callable[..., nn.Module] = nn.ReLU, + scale_activation: Callable[..., nn.Module] = nn.Sigmoid, ) -> None: super().__init__() self.fc1 = nn.Conv2d(input_channels, squeeze_channels, 1) self.fc2 = nn.Conv2d(squeeze_channels, input_channels, 1) self.activation = activation() + self.scale_activation = scale_activation() def _scale(self, input: Tensor) -> Tensor: scale = F.adaptive_avg_pool2d(input, 1) diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py index 0b49bc17fb8..eb2ef1bdabe 100644 --- a/torchvision/models/regnet.py +++ b/torchvision/models/regnet.py @@ -21,44 +21,6 @@ } -class BasicTransform(nn.Sequential): - """Basic transformation: [3x3 conv, BN, Relu] x2.""" - - def __init__( - self, - width_in: int, - width_out: int, - stride: int, - norm_layer: Callable[..., nn.Module], - activation_layer: Callable[..., nn.Module], - ) -> None: - super().__init__(OrderedDict( - a=nn.Sequential( - ConvBNActivation(width_in, width_out, kernel_size=3, stride=stride, - norm_layer=norm_layer, activation_layer=activation_layer), - nn.Conv2d(width_out, width_out, 3, stride=1, padding=1, bias=False), - ), - final_bn=norm_layer(width_out), - )) - 
- -class ResStemIN(nn.Sequential): - """ResNet stem for ImageNet: 7x7, BN, ReLU, MaxPool.""" - - def __init__( - self, - width_in: int, - width_out: int, - norm_layer: Callable[..., nn.Module], - activation_layer: Callable[..., nn.Module], - ) -> None: - super().__init__( - ConvBNActivation(width_in, width_out, kernel_size=7, stride=2, - norm_layer=norm_layer, activation_layer=activation_layer), - nn.MaxPool2d(3, stride=2, padding=1), - ) - - class SimpleStemIN(ConvBNActivation): """Simple stem for ImageNet: 3x3, BN, ReLU.""" @@ -73,61 +35,6 @@ def __init__( norm_layer=norm_layer, activation_layer=activation_layer) -class VanillaBlock(nn.Sequential): - """Vanilla block: [3x3 conv, BN, Relu] x2.""" - - def __init__( - self, - width_in: int, - width_out: int, - stride: int, - norm_layer: Callable[..., nn.Module], - activation_layer: Callable[..., nn.Module], - *args, - **kwargs, - ) -> None: - super().__init__(OrderedDict( - a=ConvBNActivation(width_in, width_out, kernel_size=3, stride=stride, - norm_layer=norm_layer, activation_layer=activation_layer), - b=ConvBNActivation(width_out, width_out, kernel_size=3, stride=1, - norm_layer=norm_layer, activation_layer=activation_layer), - )) - - -class ResBasicBlock(nn.Module): - """Residual basic block: x + F(x), F = basic transform.""" - - def __init__( - self, - width_in: int, - width_out: int, - stride: int, - norm_layer: Callable[..., nn.Module], - activation_layer: Callable[..., nn.Module], - *args, - **kwargs, - ) -> None: - super().__init__() - self.proj_block = (width_in != width_out) or (stride != 1) - if self.proj_block: - self.proj = nn.Conv2d( - width_in, width_out, 1, stride=stride, padding=0, bias=False - ) - self.bn = norm_layer(width_out) - self.f = BasicTransform( - width_in, width_out, stride, norm_layer, activation_layer - ) - self.activation = activation_layer(inplace=True) - - def forward(self, x: Tensor) -> Tensor: - if self.proj_block: - x = self.bn(self.proj(x)) + self.f(x) - else: - x = x + self.f(x) - - return self.activation(x) - - class BottleneckTransform(nn.Sequential): """Bottleneck transformation: 1x1, 3x3 [+SE], 1x1.""" @@ -201,9 +108,6 @@ def __init__( ) self.activation = activation_layer(inplace=True) - # The projection and transform happen in parallel, - # and activation is not counted with respect to depth - def forward(self, x: Tensor) -> Tensor: if self.proj_block: x = self.bn(self.proj(x)) + self.f(x) @@ -212,37 +116,6 @@ def forward(self, x: Tensor) -> Tensor: return self.activation(x) -class ResBottleneckLinearBlock(nn.Module): - """Residual linear bottleneck block: x + F(x), F = bottleneck transform.""" - - def __init__( - self, - width_in: int, - width_out: int, - stride: int, - norm_layer: Callable[..., nn.Module], - activation_layer: Callable[..., nn.Module], - group_width: int = 1, - bottleneck_multiplier: float = 4.0, - se_ratio: Optional[float] = None, - ) -> None: - super().__init__() - self.has_skip = (width_in == width_out) and (stride == 1) - self.f = BottleneckTransform( - width_in, - width_out, - stride, - norm_layer, - activation_layer, - group_width, - bottleneck_multiplier, - se_ratio, - ) - - def forward(self, x: Tensor) -> Tensor: - return x + self.f(x) if self.has_skip else self.f(x) - - class AnyStage(nn.Sequential): """AnyNet stage (sequence of blocks w/ the same output shape).""" @@ -288,6 +161,7 @@ def __init__( bottleneck_multiplier: float = 1.0, use_se: bool = True, se_ratio: float = 0.25, + **kwargs: Any, ) -> None: if w_a < 0 or w_0 <= 0 or w_m <= 1 or w_0 % 8 != 0: raise 
ValueError("Invalid RegNet settings") From 293073d6d6d40c7403ee2be719159a75f80a272c Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Wed, 22 Sep 2021 22:19:49 +0000 Subject: [PATCH 35/40] support scaled model --- torchvision/models/regnet.py | 111 +++++++++++++++++++++-------------- 1 file changed, 67 insertions(+), 44 deletions(-) diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py index eb2ef1bdabe..7b9864b96f9 100644 --- a/torchvision/models/regnet.py +++ b/torchvision/models/regnet.py @@ -153,28 +153,32 @@ def __init__( class BlockParams: def __init__( self, + depths: List[int], + widths: List[int], + group_widths: List[int], + bottleneck_multipliers: List[int], + strides: List[int], + se_ratio: Optional[float] = None, + ) -> None: + self.depths = depths + self.widths = widths + self.group_widths = group_widths + self.bottleneck_multipliers = bottleneck_multipliers + self.strides = strides + self.se_ratio = se_ratio + + @classmethod + def from_init_params( + cls, depth: int, w_0: int, w_a: float, w_m: float, group_width: int, bottleneck_multiplier: float = 1.0, - use_se: bool = True, - se_ratio: float = 0.25, + se_ratio: Optional[float] = None, **kwargs: Any, - ) -> None: - if w_a < 0 or w_0 <= 0 or w_m <= 1 or w_0 % 8 != 0: - raise ValueError("Invalid RegNet settings") - self.depth = depth - self.w_0 = w_0 - self.w_a = w_a - self.w_m = w_m - self.group_width = group_width - self.bottleneck_multiplier = bottleneck_multiplier - self.use_se = use_se - self.se_ratio = se_ratio if use_se else None - - def get_expanded_params(self): + ) -> "BlockParams": """ Programatically compute all the per-block settings, given the RegNet parameters. @@ -198,11 +202,13 @@ def get_expanded_params(self): QUANT = 8 STRIDE = 2 + if w_a < 0 or w_0 <= 0 or w_m <= 1 or w_0 % 8 != 0: + raise ValueError("Invalid RegNet settings") # Compute the block widths. 
Each stage has one unique block width - widths_cont = torch.arange(self.depth) * self.w_a + self.w_0 - block_capacity = torch.round(torch.log(widths_cont / self.w_0) / math.log(self.w_m)) + widths_cont = torch.arange(depth) * w_a + w_0 + block_capacity = torch.round(torch.log(widths_cont / w_0) / math.log(w_m)) block_widths = ( - torch.round(torch.divide(self.w_0 * torch.pow(self.w_m, block_capacity), QUANT)) + torch.round(torch.divide(w_0 * torch.pow(w_m, block_capacity), QUANT)) * QUANT ).int() num_stages = len(torch.unique(block_widths)) @@ -221,16 +227,26 @@ def get_expanded_params(self): stage_depths = torch.diff(torch.Tensor([d for d, t in enumerate(splits) if t])).int().tolist() strides = [STRIDE] * num_stages - bottleneck_multipliers = [self.bottleneck_multiplier] * num_stages - group_widths = [self.group_width] * num_stages + bottleneck_multipliers = [bottleneck_multiplier] * num_stages + group_widths = [group_width] * num_stages # Adjust the compatibility of stage widths and group widths - stage_widths, group_widths = self._adjust_widths_groups_compatibilty( + stage_widths, group_widths = cls._adjust_widths_groups_compatibilty( stage_widths, bottleneck_multipliers, group_widths ) + return cls( + depths=stage_depths, + widths=stage_widths, + group_widths=group_widths, + bottleneck_multipliers=bottleneck_multipliers, + strides=strides, + se_ratio=se_ratio, + ) + + def _get_expanded_params(self): return zip( - stage_widths, strides, stage_depths, group_widths, bottleneck_multipliers + self.widths, self.strides, self.depths, self.group_widths, self.bottleneck_multipliers ) @staticmethod @@ -290,7 +306,7 @@ def __init__( depth, group_width, bottleneck_multiplier, - ) in enumerate(block_params.get_expanded_params()): + ) in enumerate(block_params._get_expanded_params()): blocks.append( ( f"block{i+1}", @@ -364,7 +380,8 @@ def regnet_y_400mf(pretrained: bool = False, progress: bool = True, **kwargs: An pretrained (bool): If True, returns a model pre-trained on ImageNet progress (bool): If True, displays a progress bar of the download to stderr """ - params = BlockParams(depth=16, w_0=48, w_a=27.89, w_m=2.09, group_width=8, **kwargs) + params = BlockParams.from_init_params(depth=16, w_0=48, w_a=27.89, w_m=2.09, + group_width=8, se_ratio=0.25, **kwargs) return _regnet("regnet_y_400mf", params, pretrained, progress, **kwargs) @@ -377,7 +394,8 @@ def regnet_y_800mf(pretrained: bool = False, progress: bool = True, **kwargs: An pretrained (bool): If True, returns a model pre-trained on ImageNet progress (bool): If True, displays a progress bar of the download to stderr """ - params = BlockParams(depth=14, w_0=56, w_a=38.84, w_m=2.4, group_width=16, **kwargs) + params = BlockParams.from_init_params(depth=14, w_0=56, w_a=38.84, w_m=2.4, + group_width=16, se_ratio=0.25, **kwargs) return _regnet("regnet_y_800mf", params, pretrained, progress, **kwargs) @@ -390,7 +408,8 @@ def regnet_y_1_6gf(pretrained: bool = False, progress: bool = True, **kwargs: An pretrained (bool): If True, returns a model pre-trained on ImageNet progress (bool): If True, displays a progress bar of the download to stderr """ - params = BlockParams(depth=27, w_0=48, w_a=20.71, w_m=2.65, group_width=24, **kwargs) + params = BlockParams.from_init_params(depth=27, w_0=48, w_a=20.71, w_m=2.65, + group_width=24, se_ratio=0.25, **kwargs) return _regnet("regnet_y_1_6gf", params, pretrained, progress, **kwargs) @@ -403,7 +422,8 @@ def regnet_y_3_2gf(pretrained: bool = False, progress: bool = True, **kwargs: An pretrained (bool): If 
True, returns a model pre-trained on ImageNet progress (bool): If True, displays a progress bar of the download to stderr """ - params = BlockParams(depth=21, w_0=80, w_a=42.63, w_m=2.66, group_width=24, **kwargs) + params = BlockParams.from_init_params(depth=21, w_0=80, w_a=42.63, w_m=2.66, + group_width=24, se_ratio=0.25, **kwargs) return _regnet("regnet_y_3_2gf", params, pretrained, progress, **kwargs) @@ -416,7 +436,8 @@ def regnet_y_8gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) pretrained (bool): If True, returns a model pre-trained on ImageNet progress (bool): If True, displays a progress bar of the download to stderr """ - params = BlockParams(depth=17, w_0=192, w_a=76.82, w_m=2.19, group_width=56, **kwargs) + params = BlockParams.from_init_params(depth=17, w_0=192, w_a=76.82, w_m=2.19, + group_width=56, se_ratio=0.25, **kwargs) return _regnet("regnet_y_8gf", params, pretrained, progress, **kwargs) @@ -429,7 +450,8 @@ def regnet_y_16gf(pretrained: bool = False, progress: bool = True, **kwargs: Any pretrained (bool): If True, returns a model pre-trained on ImageNet progress (bool): If True, displays a progress bar of the download to stderr """ - params = BlockParams(depth=18, w_0=200, w_a=106.23, w_m=2.48, group_width=112, **kwargs) + params = BlockParams.from_init_params(depth=18, w_0=200, w_a=106.23, w_m=2.48, + group_width=112, se_ratio=0.25, **kwargs) return _regnet("regnet_y_16gf", params, pretrained, progress, **kwargs) @@ -442,7 +464,8 @@ def regnet_y_32gf(pretrained: bool = False, progress: bool = True, **kwargs: Any pretrained (bool): If True, returns a model pre-trained on ImageNet progress (bool): If True, displays a progress bar of the download to stderr """ - params = BlockParams(depth=20, w_0=232, w_a=115.89, w_m=2.53, group_width=232, **kwargs) + params = BlockParams.from_init_params(depth=20, w_0=232, w_a=115.89, w_m=2.53, + group_width=232, se_ratio=0.25, **kwargs) return _regnet("regnet_y_32gf", params, pretrained, progress, **kwargs) @@ -455,8 +478,8 @@ def regnet_x_400mf(pretrained: bool = False, progress: bool = True, **kwargs: An pretrained (bool): If True, returns a model pre-trained on ImageNet progress (bool): If True, displays a progress bar of the download to stderr """ - params = BlockParams(depth=22, w_0=24, w_a=24.48, w_m=2.54, - group_width=16, use_se=False, **kwargs) + params = BlockParams.from_init_params(depth=22, w_0=24, w_a=24.48, w_m=2.54, + group_width=16, **kwargs) return _regnet("regnet_x_400mf", params, pretrained, progress, **kwargs) @@ -469,8 +492,8 @@ def regnet_x_800mf(pretrained: bool = False, progress: bool = True, **kwargs: An pretrained (bool): If True, returns a model pre-trained on ImageNet progress (bool): If True, displays a progress bar of the download to stderr """ - params = BlockParams(depth=16, w_0=56, w_a=35.73, w_m=2.28, group_width=16, - use_se=False, **kwargs) + params = BlockParams.from_init_params(depth=16, w_0=56, w_a=35.73, w_m=2.28, + group_width=16, **kwargs) return _regnet("regnet_x_800mf", params, pretrained, progress, **kwargs) @@ -483,8 +506,8 @@ def regnet_x_1_6gf(pretrained: bool = False, progress: bool = True, **kwargs: An pretrained (bool): If True, returns a model pre-trained on ImageNet progress (bool): If True, displays a progress bar of the download to stderr """ - params = BlockParams(depth=18, w_0=80, w_a=34.01, w_m=2.25, group_width=24, - use_se=False, **kwargs) + params = BlockParams.from_init_params(depth=18, w_0=80, w_a=34.01, w_m=2.25, + group_width=24, **kwargs) return 
_regnet("regnet_x_1_6gf", params, pretrained, progress, **kwargs) @@ -497,8 +520,8 @@ def regnet_x_3_2gf(pretrained: bool = False, progress: bool = True, **kwargs: An pretrained (bool): If True, returns a model pre-trained on ImageNet progress (bool): If True, displays a progress bar of the download to stderr """ - params = BlockParams(depth=25, w_0=88, w_a=26.31, w_m=2.25, group_width=48, - use_se=False, **kwargs) + params = BlockParams.from_init_params(depth=25, w_0=88, w_a=26.31, w_m=2.25, + group_width=48, **kwargs) return _regnet("regnet_x_3_2gf", params, pretrained, progress, **kwargs) @@ -511,8 +534,8 @@ def regnet_x_8gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) pretrained (bool): If True, returns a model pre-trained on ImageNet progress (bool): If True, displays a progress bar of the download to stderr """ - params = BlockParams(depth=23, w_0=80, w_a=49.56, w_m=2.88, group_width=120, - use_se=False, **kwargs) + params = BlockParams.from_init_params(depth=23, w_0=80, w_a=49.56, w_m=2.88, + group_width=120, **kwargs) return _regnet("regnet_x_8gf", params, pretrained, progress, **kwargs) @@ -525,8 +548,8 @@ def regnet_x_16gf(pretrained: bool = False, progress: bool = True, **kwargs: Any pretrained (bool): If True, returns a model pre-trained on ImageNet progress (bool): If True, displays a progress bar of the download to stderr """ - params = BlockParams(depth=22, w_0=216, w_a=55.59, w_m=2.1, group_width=128, - use_se=False, **kwargs) + params = BlockParams.from_init_params(depth=22, w_0=216, w_a=55.59, w_m=2.1, + group_width=128, **kwargs) return _regnet("regnet_x_16gf", params, pretrained, progress, **kwargs) @@ -539,8 +562,8 @@ def regnet_x_32gf(pretrained: bool = False, progress: bool = True, **kwargs: Any pretrained (bool): If True, returns a model pre-trained on ImageNet progress (bool): If True, displays a progress bar of the download to stderr """ - params = BlockParams(depth=23, w_0=320, w_a=69.86, w_m=2.0, group_width=168, - use_se=False, **kwargs) + params = BlockParams.from_init_params(depth=23, w_0=320, w_a=69.86, w_m=2.0, + group_width=168, **kwargs) return _regnet("regnet_x_32gf", params, pretrained, progress, **kwargs) # TODO(kazhang): Add RegNetZ_500MF and RegNetZ_4GF From 3957d5dd5797849ce8a40d7daa2aa1db55795664 Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Wed, 22 Sep 2021 22:37:08 +0000 Subject: [PATCH 36/40] fuse into ConvBNActivation --- torchvision/models/regnet.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py index 7b9864b96f9..9a5674f6abe 100644 --- a/torchvision/models/regnet.py +++ b/torchvision/models/regnet.py @@ -68,8 +68,8 @@ def __init__( activation=activation_layer, ) - layers["c"] = nn.Conv2d(w_b, width_out, 1, stride=1, padding=0, bias=False) - layers["final_bn"] = norm_layer(width_out) + layers["c"] = ConvBNActivation(w_b, width_out, kernel_size=1, stride=1, + norm_layer=norm_layer, activation_layer=nn.Identity) super().__init__(layers) @@ -92,10 +92,8 @@ def __init__( # Use skip connection with projection if shape changes self.proj_block = (width_in != width_out) or (stride != 1) if self.proj_block: - self.proj = nn.Conv2d( - width_in, width_out, 1, stride=stride, padding=0, bias=False - ) - self.bn = norm_layer(width_out) + self.proj = ConvBNActivation(width_in, width_out, kernel_size=1, + stride=stride, norm_layer=norm_layer, activation_layer=nn.Identity) self.f = BottleneckTransform( width_in, width_out, @@ -110,7 +108,7 @@ def 
__init__( def forward(self, x: Tensor) -> Tensor: if self.proj_block: - x = self.bn(self.proj(x)) + self.f(x) + x = self.proj(x) + self.f(x) else: x = x + self.f(x) return self.activation(x) From 208f045fa6fb6301dc76ec6c2d0399c7782c98c2 Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Wed, 22 Sep 2021 22:44:24 +0000 Subject: [PATCH 37/40] make reset_parameters private --- torchvision/models/regnet.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py index 9a5674f6abe..d18057641a1 100644 --- a/torchvision/models/regnet.py +++ b/torchvision/models/regnet.py @@ -332,7 +332,7 @@ def __init__( self.fc = nn.Linear(in_features=current_width, out_features=num_classes) # Init weights and good to go - self.reset_parameters() + self._reset_parameters() def forward(self, x: Tensor) -> Tensor: x = self.stem(x) @@ -344,7 +344,7 @@ def forward(self, x: Tensor) -> Tensor: return x - def reset_parameters(self) -> None: + def _reset_parameters(self) -> None: # Performs ResNet-style weight initialization for m in self.modules(): if isinstance(m, nn.Conv2d): From f78a27fb2b1a70fdbf3e9a74171380d945346bc5 Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Wed, 22 Sep 2021 23:56:16 +0000 Subject: [PATCH 38/40] fix type errors --- torchvision/models/regnet.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py index d18057641a1..f4fd2702f93 100644 --- a/torchvision/models/regnet.py +++ b/torchvision/models/regnet.py @@ -49,7 +49,7 @@ def __init__( bottleneck_multiplier: float, se_ratio: Optional[float], ) -> None: - layers = OrderedDict() + layers: OrderedDict[str, nn.Module] = OrderedDict() w_b = int(round(width_out * bottleneck_multiplier)) g = w_b // group_width @@ -154,7 +154,7 @@ def __init__( depths: List[int], widths: List[int], group_widths: List[int], - bottleneck_multipliers: List[int], + bottleneck_multipliers: List[float], strides: List[int], se_ratio: Optional[float] = None, ) -> None: @@ -208,9 +208,8 @@ def from_init_params( block_widths = ( torch.round(torch.divide(w_0 * torch.pow(w_m, block_capacity), QUANT)) * QUANT - ).int() - num_stages = len(torch.unique(block_widths)) - block_widths = block_widths.tolist() + ).int().tolist() + num_stages = len(set(block_widths)) # Convert to per stage parameters split_helper = zip( @@ -222,7 +221,7 @@ def from_init_params( splits = [w != wp or r != rp for w, wp, r, rp in split_helper] stage_widths = [w for w, t in zip(block_widths, splits[:-1]) if t] - stage_depths = torch.diff(torch.Tensor([d for d, t in enumerate(splits) if t])).int().tolist() + stage_depths = torch.diff(torch.tensor([d for d, t in enumerate(splits) if t])).int().tolist() strides = [STRIDE] * num_stages bottleneck_multipliers = [bottleneck_multiplier] * num_stages From f59ea8c9b262290c6847a2da33d23b7c2e40b9c0 Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Thu, 23 Sep 2021 05:13:47 +0000 Subject: [PATCH 39/40] fix for unit test --- torchvision/models/regnet.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py index f4fd2702f93..f18055f80b4 100644 --- a/torchvision/models/regnet.py +++ b/torchvision/models/regnet.py @@ -15,6 +15,13 @@ from torchvision.models.mobilenetv2 import ConvBNActivation, _make_divisible from torchvision.models.efficientnet import SqueezeExcitation + +__all__ = ["RegNet", "regnet_y_400mf", "regnet_y_800mf", 
"regnet_y_1_6gf", + "regnet_y_3_2gf", "regnet_y_8gf", "regnet_y_16gf", "regnet_y_32gf", + "regnet_x_400mf", "regnet_x_800mf", "regnet_x_1_6gf", "regnet_x_3_2gf", + "regnet_x_8gf", "regnet_x_16gf", "regnet_x_32gf"] + + model_urls = { # TODO(kazhang): add pretrained weights "regnet_y_400m": "", @@ -90,8 +97,9 @@ def __init__( super().__init__() # Use skip connection with projection if shape changes - self.proj_block = (width_in != width_out) or (stride != 1) - if self.proj_block: + self.proj = None + should_proj = (width_in != width_out) or (stride != 1) + if should_proj: self.proj = ConvBNActivation(width_in, width_out, kernel_size=1, stride=stride, norm_layer=norm_layer, activation_layer=nn.Identity) self.f = BottleneckTransform( @@ -107,7 +115,7 @@ def __init__( self.activation = activation_layer(inplace=True) def forward(self, x: Tensor) -> Tensor: - if self.proj_block: + if self.proj is not None: x = self.proj(x) + self.f(x) else: x = x + self.f(x) From b0325b6db1cf69011def768aa3f86229c7dddf31 Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Wed, 29 Sep 2021 00:06:05 +0000 Subject: [PATCH 40/40] add pretrained weights for 6 variant models, update docs --- docs/source/models.rst | 16 +++++++--------- references/classification/README.md | 26 +++++++++++++++++++++++++- torchvision/models/regnet.py | 8 ++++++-- 3 files changed, 38 insertions(+), 12 deletions(-) diff --git a/docs/source/models.rst b/docs/source/models.rst index be2a007d9ae..ef9c326ade4 100644 --- a/docs/source/models.rst +++ b/docs/source/models.rst @@ -111,18 +111,10 @@ These can be constructed by passing ``pretrained=True``: efficientnet_b7 = models.efficientnet_b7(pretrained=True) regnet_y_400mf = models.regnet_y_400mf(pretrained=True) regnet_y_800mf = models.regnet_y_800mf(pretrained=True) - regnet_y_1_6gf = models.regnet_y_1_6gf(pretrained=True) - regnet_y_3_2gf = models.regnet_y_3_2gf(pretrained=True) regnet_y_8gf = models.regnet_y_8gf(pretrained=True) - regnet_y_16gf = models.regnet_y_16gf(pretrained=True) - regnet_y_32gf = models.regnet_y_32gf(pretrained=True) regnet_x_400mf = models.regnet_x_400mf(pretrained=True) regnet_x_800mf = models.regnet_x_800mf(pretrained=True) - regnet_x_1_6gf = models.regnet_x_1_6gf(pretrained=True) - regnet_x_3_2gf = models.regnet_x_3_2gf(pretrained=True) regnet_x_8gf = models.regnet_x_8gf(pretrained=True) - regnet_x_16gf = models.regnet_x_16gf(pretrained=True) - regnet_x_32gf = models.regnet_x_32gf(pretrained=True) Instancing a pre-trained model will download its weights to a cache directory. This directory can be set using the `TORCH_MODEL_ZOO` environment variable. See @@ -217,6 +209,12 @@ EfficientNet-B4 83.384 96.594 EfficientNet-B5 83.444 96.628 EfficientNet-B6 84.008 96.916 EfficientNet-B7 84.122 96.908 +regnet_x_400mf 72.834 90.950 +regnet_x_800mf 75.190 92.418 +regnet_x_8gf 79.324 94.694 +regnet_y_400mf 74.024 91.680 +regnet_y_800mf 76.420 93.136 +regnet_y_8gf 79.966 95.100 ================================ ============= ============= @@ -347,7 +345,7 @@ EfficientNet .. autofunction:: efficientnet_b6 .. autofunction:: efficientnet_b7 -EfficientNet +RegNet ------------ .. 
autofunction:: regnet_y_400mf diff --git a/references/classification/README.md b/references/classification/README.md index 5d945d2728d..cc328f0f259 100644 --- a/references/classification/README.md +++ b/references/classification/README.md @@ -81,9 +81,33 @@ The weights of the B5-B7 variants are ported from Luke Melas' [EfficientNet-PyTo ### RegNet + +#### Small models +``` +torchrun --nproc_per_node=8 train.py\ + --model $MODEL --epochs 100 --batch-size 128 --wd 0.00005 --lr=0.8\ + --lr-scheduler=cosineannealinglr --lr-warmup-method=linear\ + --lr-warmup-epochs=5 --lr-warmup-decay=0.1 +``` +Here `$MODEL` is one of `regnet_x_400mf`, `regnet_x_800mf`, `regnet_x_1_6gf`, `regnet_y_400mf`, `regnet_y_800mf` and `regnet_y_1_6gf`. Please note we used learning rate 0.4 for `regent_y_400mf` to get the same Acc@1 as [the paper)(https://arxiv.org/abs/2003.13678). + +### Medium models ``` -torchrun --nproc_per_node=8 train.py --model regnet_y_400mf --epochs 100 --batch-size 128 +torchrun --nproc_per_node=8 train.py\ + --model $MODEL --epochs 100 --batch-size 64 --wd 0.00005 --lr=0.4\ + --lr-scheduler=cosineannealinglr --lr-warmup-method=linear\ + --lr-warmup-epochs=5 --lr-warmup-decay=0.1 +``` +Here `$MODEL` is one of `regnet_x_3_2gf`, `regnet_x_8gf`, `regnet_x_16gf`, `regnet_y_3_2gf` and `regnet_y_8gf`. + +### Large models +``` +torchrun --nproc_per_node=8 train.py\ + --model $MODEL --epochs 100 --batch-size 32 --wd 0.00005 --lr=0.2\ + --lr-scheduler=cosineannealinglr --lr-warmup-method=linear\ + --lr-warmup-epochs=5 --lr-warmup-decay=0.1 ``` +Here `$MODEL` is one of `regnet_x_32gf`, `regnet_y_16gf` and `regnet_y_32gf`. ## Mixed precision training Automatic Mixed Precision (AMP) training on GPU for Pytorch can be enabled with the [NVIDIA Apex extension](https://github.com/NVIDIA/apex). diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py index f18055f80b4..bbab59c4074 100644 --- a/torchvision/models/regnet.py +++ b/torchvision/models/regnet.py @@ -23,8 +23,12 @@ model_urls = { - # TODO(kazhang): add pretrained weights - "regnet_y_400m": "", + "regnet_y_400mf": "https://download.pytorch.org/models/regnet_y_400mf-540e987b.pth", + "regnet_y_800mf": "https://download.pytorch.org/models/regnet_y_800mf-1b27b58c.pth", + "regnet_y_8gf": "https://download.pytorch.org/models/regnet_y_8gf-49ff86b5.pth", + "regnet_x_400mf": "https://download.pytorch.org/models/regnet_x_400mf-adf1edd5.pth", + "regnet_x_800mf": "https://download.pytorch.org/models/regnet_x_800mf-5cb79b7e.pth", + "regnet_x_8gf": "https://download.pytorch.org/models/regnet_x_8gf-4c4e575e.pth", }
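
The `BlockParams.from_init_params` helper added in PATCH 35 is the only non-obvious piece of arithmetic in the series: it expands the four scalar RegNet parameters (`depth`, `w_0`, `w_a`, `w_m`) into per-stage widths and depths by snapping a linearly growing width sequence onto a log-spaced grid. A minimal stand-alone sketch of that expansion (reviewer aid only, written in plain Python instead of the torch ops used in the patch, and omitting the group-width compatibility adjustment):

```python
import math

def expand_params(depth, w_0, w_a, w_m, quant=8):
    # Continuous per-block widths: u_j = w_0 + w_a * j.
    widths_cont = [w_0 + w_a * j for j in range(depth)]
    # Snap each width onto the grid w_0 * w_m**k, rounded to a multiple of `quant`.
    block_widths = [
        int(round(w_0 * w_m ** round(math.log(u / w_0, w_m)) / quant)) * quant
        for u in widths_cont
    ]
    # Consecutive blocks with the same width form one stage; collect (width, depth) pairs.
    stages = []
    for w in block_widths:
        if stages and stages[-1][0] == w:
            stages[-1][1] += 1
        else:
            stages.append([w, 1])
    return stages

# The regnet_y_400mf settings used in this file (depth=16, w_0=48, w_a=27.89, w_m=2.09)
# expand to four stages: [[48, 1], [104, 3], [208, 6], [440, 6]] (1 + 3 + 6 + 6 = 16 blocks).
print(expand_params(depth=16, w_0=48, w_a=27.89, w_m=2.09))
```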
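
With the checkpoint URLs now registered in `model_urls`, the final state of the series can be smoke-tested end to end. A minimal sketch, assuming this branch is installed as `torchvision`, that the builders are re-exported from `torchvision.models` (as the docs snippet above already uses), and that the listed URLs are reachable:

```python
import torch
from torchvision import models

# regnet_y_400mf is one of the six variants that received pretrained weights in
# PATCH 40; pretrained=True downloads the URL registered in model_urls.
model = models.regnet_y_400mf(pretrained=True)
model.eval()

# The reported Acc@1 / Acc@5 numbers assume standard 224x224 ImageNet crops.
x = torch.randn(1, 3, 224, 224)
with torch.no_grad():
    logits = model(x)

print(logits.shape)  # torch.Size([1, 1000]) with the default num_classes=1000
```

The same check applies to the other five variants with registered checkpoints (`regnet_y_800mf`, `regnet_y_8gf`, `regnet_x_400mf`, `regnet_x_800mf`, `regnet_x_8gf`); the remaining builders can only be constructed with `pretrained=False` for now, since `_regnet` raises a `ValueError` when no URL is registered for the requested architecture.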