From 9d3f0b1afd78f454f5530dac5bde9a6e8a34c118 Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Thu, 9 Sep 2021 00:15:55 +0000 Subject: [PATCH 01/40] initial code --- torchvision/models/regnet.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 torchvision/models/regnet.py diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py new file mode 100644 index 00000000000..96493ae3c4c --- /dev/null +++ b/torchvision/models/regnet.py @@ -0,0 +1,13 @@ +from torch import nn + +class RegNetParams: + pass + +class SqueezeExcitation(nn.Module): + pass + +class RegNet(nn.Module): + pass + +def regnet_y_400mf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + pass From e797fcab7074c7aaabed4726cdf9e098db9d317f Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Thu, 9 Sep 2021 05:08:36 +0000 Subject: [PATCH 02/40] add SqueezeExcitation --- torchvision/models/regnet.py | 40 ++++++++++++++++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py index 96493ae3c4c..eff8273695a 100644 --- a/torchvision/models/regnet.py +++ b/torchvision/models/regnet.py @@ -1,10 +1,46 @@ -from torch import nn +from torch import nn, Tenspr class RegNetParams: pass class SqueezeExcitation(nn.Module): - pass + """ + Squeeze and excitation layer from + `"Squeeze-and-Excitation Networks" `_. + """ + + def __init__( + self, + in_channels: int, + reduction_ratio: Optional[int] = 16, + reduced_channels: Optional[int] = None, + activation: Optional[nn.Module] = None, + ) -> None: + super().__init__() + self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) + + # Either reduction_ratio is defined, or out_channels is defined, + # neither both nor none of them + assert bool(reduction_ratio) != bool(reduced_channels) + + if activation is None: + activation = nn.ReLU() + + reduced_channels = ( + in_channels // reduction_ratio if reduced_channels is None else reduced_channels + ) + self.excitation = nn.Sequential( + nn.Conv2d(in_channels, reduced_channels, kernel_size=1, stride=1, bias=True), + activation, + nn.Conv2d(reduced_channels, in_channels, kernel_size=1, stride=1, bias=True), + nn.Sigmoid(), + ) + + def forward(self, x: Tensor) -> Tensor: + x_squeezed = self.avgpool(x) + x_excited = self.excitation(x_squeezed) + x_scaled = x * x_excited + return x_scaled class RegNet(nn.Module): pass From 692fbaaa536bd50436c66efb5a2a8623c1d74285 Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Thu, 9 Sep 2021 00:15:55 +0000 Subject: [PATCH 03/40] initial code --- torchvision/models/regnet.py | 40 ++---------------------------------- 1 file changed, 2 insertions(+), 38 deletions(-) diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py index eff8273695a..96493ae3c4c 100644 --- a/torchvision/models/regnet.py +++ b/torchvision/models/regnet.py @@ -1,46 +1,10 @@ -from torch import nn, Tenspr +from torch import nn class RegNetParams: pass class SqueezeExcitation(nn.Module): - """ - Squeeze and excitation layer from - `"Squeeze-and-Excitation Networks" `_. 
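(Illustration only, not part of the patch: assuming the SqueezeExcitation module defined above is importable, it globally average-pools the input, runs two 1x1 convolutions to produce per-channel sigmoid gates, and rescales the input by them. The `Tenspr` typo and the missing `Optional`/`Any` imports in these first revisions are cleaned up when the file is rewritten later in the series.)
```
import torch

# Sketch of using the SqueezeExcitation block standalone; the output keeps the input shape,
# with each channel rescaled by a gate in [0, 1].
se = SqueezeExcitation(in_channels=64, reduction_ratio=16)   # reduced to 64 // 16 = 4 channels
x = torch.randn(2, 64, 56, 56)
out = se(x)
print(out.shape)   # torch.Size([2, 64, 56, 56])
```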
- """ - - def __init__( - self, - in_channels: int, - reduction_ratio: Optional[int] = 16, - reduced_channels: Optional[int] = None, - activation: Optional[nn.Module] = None, - ) -> None: - super().__init__() - self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) - - # Either reduction_ratio is defined, or out_channels is defined, - # neither both nor none of them - assert bool(reduction_ratio) != bool(reduced_channels) - - if activation is None: - activation = nn.ReLU() - - reduced_channels = ( - in_channels // reduction_ratio if reduced_channels is None else reduced_channels - ) - self.excitation = nn.Sequential( - nn.Conv2d(in_channels, reduced_channels, kernel_size=1, stride=1, bias=True), - activation, - nn.Conv2d(reduced_channels, in_channels, kernel_size=1, stride=1, bias=True), - nn.Sigmoid(), - ) - - def forward(self, x: Tensor) -> Tensor: - x_squeezed = self.avgpool(x) - x_excited = self.excitation(x_squeezed) - x_scaled = x * x_excited - return x_scaled + pass class RegNet(nn.Module): pass From eb6fb9f28fca11fbef5b633607ae1b71729eed8f Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Thu, 9 Sep 2021 05:08:36 +0000 Subject: [PATCH 04/40] add SqueezeExcitation --- torchvision/models/regnet.py | 40 ++++++++++++++++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py index 96493ae3c4c..eff8273695a 100644 --- a/torchvision/models/regnet.py +++ b/torchvision/models/regnet.py @@ -1,10 +1,46 @@ -from torch import nn +from torch import nn, Tenspr class RegNetParams: pass class SqueezeExcitation(nn.Module): - pass + """ + Squeeze and excitation layer from + `"Squeeze-and-Excitation Networks" `_. + """ + + def __init__( + self, + in_channels: int, + reduction_ratio: Optional[int] = 16, + reduced_channels: Optional[int] = None, + activation: Optional[nn.Module] = None, + ) -> None: + super().__init__() + self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) + + # Either reduction_ratio is defined, or out_channels is defined, + # neither both nor none of them + assert bool(reduction_ratio) != bool(reduced_channels) + + if activation is None: + activation = nn.ReLU() + + reduced_channels = ( + in_channels // reduction_ratio if reduced_channels is None else reduced_channels + ) + self.excitation = nn.Sequential( + nn.Conv2d(in_channels, reduced_channels, kernel_size=1, stride=1, bias=True), + activation, + nn.Conv2d(reduced_channels, in_channels, kernel_size=1, stride=1, bias=True), + nn.Sigmoid(), + ) + + def forward(self, x: Tensor) -> Tensor: + x_squeezed = self.avgpool(x) + x_excited = self.excitation(x_squeezed) + x_scaled = x * x_excited + return x_scaled class RegNet(nn.Module): pass From 88840c30bf42782a56da831848d57da0bdb9c785 Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Fri, 10 Sep 2021 23:21:17 +0000 Subject: [PATCH 05/40] add SqueezeExcitation --- torchvision/models/regnet.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py index eff8273695a..aa41e3096c2 100644 --- a/torchvision/models/regnet.py +++ b/torchvision/models/regnet.py @@ -1,11 +1,18 @@ from torch import nn, Tenspr +from torchvision.models.mobilenetv2 import _make_divisible + + +model_urls = { +} + class RegNetParams: pass + class SqueezeExcitation(nn.Module): """ - Squeeze and excitation layer from + Squeeze and excitation layer from `"Squeeze-and-Excitation Networks" `_. 
""" @@ -42,8 +49,20 @@ def forward(self, x: Tensor) -> Tensor: x_scaled = x * x_excited return x_scaled + class RegNet(nn.Module): pass + +def _regnet(arch: str, pretrained: bool, progress: bool, **kwargs: Any) -> RegNet: + model = RegNet() + if pretrained: + if arch not in model_urls: + raise ValueError(f"No checkpoint is available for model type {arch}") + state_dict = load_state_dict_from_url(model_urls[arch], progress=progress) + model.load_state_dict(state_dict) + return model + + def regnet_y_400mf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: - pass + return _regnet("regnet_y_400mf", pretrained, progress, **kwargs) From 8bde15a60050a9d1a44520de9050338655609b8d Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Tue, 14 Sep 2021 05:57:30 +0000 Subject: [PATCH 06/40] regnet blocks, stems and model definition --- torchvision/models/__init__.py | 1 + torchvision/models/regnet.py | 563 ++++++++++++++++++++++++++++++++- 2 files changed, 557 insertions(+), 7 deletions(-) diff --git a/torchvision/models/__init__.py b/torchvision/models/__init__.py index e57f4773c8c..07ccf8de7f5 100644 --- a/torchvision/models/__init__.py +++ b/torchvision/models/__init__.py @@ -9,6 +9,7 @@ from .mnasnet import * from .shufflenetv2 import * from .efficientnet import * +from .regnet import * from . import segmentation from . import detection from . import video diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py index aa41e3096c2..eb15da6e0ad 100644 --- a/torchvision/models/regnet.py +++ b/torchvision/models/regnet.py @@ -1,4 +1,11 @@ -from torch import nn, Tenspr +import numpy as np +import math +import torch + +from collections import OrderedDict +from enum import Enum, auto +from typing import Any, List, Optional +from torch import nn, Tensor from torchvision.models.mobilenetv2 import _make_divisible @@ -6,11 +13,140 @@ } +# The different possible blocks +class BlockType(Enum): + VANILLA_BLOCK = auto() + RES_BASIC_BLOCK = auto() + RES_BOTTLENECK_BLOCK = auto() + RES_BOTTLENECK_LINEAR_BLOCK = auto() + + +# The different possible Stems +class StemType(Enum): + RES_STEM_CIFAR = auto() + RES_STEM_IN = auto() + SIMPLE_STEM_IN = auto() + + +# The different possible activations +class ActivationType(Enum): + RELU = auto() + SILU = auto() + + class RegNetParams: - pass + def __init__( + self, + depth: int, + w_0: int, + w_a: float, + w_m: float, + group_width: int, + bottleneck_multiplier: float = 1.0, + stem_type: StemType = StemType.SIMPLE_STEM_IN, + stem_width: int = 32, + block_type: BlockType = BlockType.RES_BOTTLENECK_BLOCK, + activation: ActivationType = ActivationType.RELU, + use_se: bool = True, + se_ratio: float = 0.25, + bn_epsilon: float = 1e-05, + bn_momentum: bool = 0.1, + ) -> None: + if w_a < 0 or w_0 <= 0 or w_m <= 1 or w_0 % 8 != 0: + raise ValueError("Invalid RegNet settings") + self.depth = depth + self.w_0 = w_0 + self.w_a = w_a + self.w_m = w_m + self.group_width = group_width + self.bottleneck_multiplier = bottleneck_multiplier + self.stem_type = stem_type + self.block_type = block_type + self.activation = activation + self.stem_width = stem_width + self.use_se = use_se + self.se_ratio = se_ratio if use_se else None + self.bn_epsilon = bn_epsilon + self.bn_momentum = bn_momentum + + def get_expanded_params(self): + """ + Programatically compute all the per-block settings, + given the RegNet parameters. + + The first step is to compute the quantized linear block parameters, + in log space. 
Key parameters are: + - `w_a` is the width progression slope + - `w_0` is the initial width + - `w_m` is the width stepping in the log space + + In other terms + `log(block_width) = log(w_0) + w_m * block_capacity`, + with `bock_capacity` ramping up following the w_0 and w_a params. + This block width is finally quantized to multiples of 8. + + The second step is to compute the parameters per stage, + taking into account the skip connection and the final 1x1 convolutions. + We use the fact that the output width is constant within a stage. + """ + + QUANT = 8 + STRIDE = 2 + + # Compute the block widths. Each stage has one unique block width + widths_cont = np.arange(self.depth) * self.w_a + self.w_0 + block_capacity = np.round(np.log(widths_cont / self.w_0) / np.log(self.w_m)) + block_widths = ( + np.round(np.divide(self.w_0 * np.power(self.w_m, block_capacity), QUANT)) + * QUANT + ) + num_stages = len(np.unique(block_widths)) + block_widths = block_widths.astype(int).tolist() + + # Convert to per stage parameters + split_helper = zip( + block_widths + [0], + [0] + block_widths, + block_widths + [0], + [0] + block_widths, + ) + splits = [w != wp or r != rp for w, wp, r, rp in split_helper] + + stage_widths = [w for w, t in zip(block_widths, splits[:-1]) if t] + stage_depths = np.diff([d for d, t in enumerate(splits) if t]).tolist() + strides = [STRIDE] * num_stages + bottleneck_multipliers = [self.bottleneck_multiplier] * num_stages + group_widths = [self.group_width] * num_stages -class SqueezeExcitation(nn.Module): + # Adjust the compatibility of stage widths and group widths + stage_widths, group_widths = self._adjust_widths_groups_compatibilty( + stage_widths, bottleneck_multipliers, group_widths + ) + + return zip( + stage_widths, strides, stage_depths, group_widths, bottleneck_multipliers + ) + + @staticmethod + def _adjust_widths_groups_compatibilty( + stage_widths: List[int], bottleneck_ratios: List[float], + group_widths: List[int]) -> Tuple(List[int], List[int]): + """ + Adjusts the compatibility of widths and groups, + depending on the bottleneck ratio. + """ + # Compute all widths for the current settings + widths = [int(w * b) for w, b in zip(stage_widths, bottleneck_ratios)] + group_widths_min = [min(g, w_bot) for g, w_bot in zip(group_widths, widths)] + + # Compute the adjusted widths so that stage and group widths fit + ws_bot = [_make_divisible(w_bot, g) for w_bot, g in zip(widths, group_widths_min)] + stage_widths = [int(w_bot / b) for w_bot, b in zip(ws_bot, bottleneck_ratios)] + return stage_widths, group_widths_min + + +class _SqueezeExcitation(nn.Module): """ Squeeze and excitation layer from `"Squeeze-and-Excitation Networks" `_. 
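As a concrete illustration of the quantized width schedule described in the docstring above (illustration only, not part of the patch), the per-stage widths and depths for the RegNetY-400MF settings used later in this series (depth=16, w_0=48, w_a=27.89, w_m=2.09) can be reproduced with a few lines of NumPy:
```
import numpy as np

depth, w_0, w_a, w_m, QUANT = 16, 48, 27.89, 2.09, 8

widths_cont = np.arange(depth) * w_a + w_0                          # continuous per-block widths
steps = np.round(np.log(widths_cont / w_0) / np.log(w_m))           # quantized log-space steps
block_widths = (np.round(w_0 * np.power(w_m, steps) / QUANT) * QUANT).astype(int)

stage_widths, stage_depths = np.unique(block_widths, return_counts=True)
print(stage_widths.tolist())   # [48, 104, 208, 440]
print(stage_depths.tolist())   # [1, 3, 6, 6]
```
With group_width=8 and a bottleneck multiplier of 1.0, `_adjust_widths_groups_compatibilty` leaves these widths unchanged, since they are already multiples of the group width.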
@@ -50,12 +186,424 @@ def forward(self, x: Tensor) -> Tensor: return x_scaled +class BasicTransform(nn.Sequential): + """Basic transformation: [3x3 conv, BN, Relu] x2.""" + + def __init__( + self, + width_in: int, + width_out: int, + stride: int, + bn_epsilon: float, + bn_momentum: float, + activation: nn.Module, + ) -> None: + super().__init__() + + self.a = nn.Sequential( + nn.Conv2d(width_in, width_out, 3, stride=stride, padding=1, bias=False), + nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum), + activation, + nn.Conv2d(width_out, width_out, 3, stride=1, padding=1, bias=False), + ) + + self.final_bn = nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum) + self.depth = 2 + + +class ResStemCifar(nn.Sequential): + """ResNet stem for CIFAR: 3x3, BN, ReLU.""" + + def __init__( + self, + width_in: int, + width_out: int, + bn_epsilon: float, + bn_momentum: float, + activation: nn.Module, + ) -> None: + super().__init__() + self.stem = nn.Sequential( + nn.Conv2d(width_in, width_out, 3, stride=1, padding=1, bias=False), + nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum), + activation, + ) + self.depth = 2 + + +class ResStemIN(nn.Sequential): + """ResNet stem for ImageNet: 7x7, BN, ReLU, MaxPool.""" + + def __init__( + self, + width_in: int, + width_out: int, + bn_epsilon: float, + bn_momentum: float, + activation: nn.Module, + ) -> None: + super().__init__() + self.stem = nn.Sequential( + nn.Conv2d(width_in, width_out, 7, stride=2, padding=3, bias=False), + nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum), + activation, + nn.MaxPool2d(3, stride=2, padding=1), + ) + self.depth = 3 + + +class SimpleStemIN(nn.Sequential): + """Simple stem for ImageNet: 3x3, BN, ReLU.""" + + def __init__( + self, + width_in: int, + width_out: int, + bn_epsilon: float, + bn_momentum: float, + activation: nn.Module, + ) -> None: + super().__init__() + self.stem = nn.Sequential( + nn.Conv2d(width_in, width_out, 3, stride=2, padding=1, bias=False), + nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum), + activation, + ) + self.depth = 2 + + +class VanillaBlock(nn.Sequential): + """Vanilla block: [3x3 conv, BN, Relu] x2.""" + + def __init__( + self, + width_in: int, + width_out: int, + stride: int, + bn_epsilon: float, + bn_momentum: float, + activation: nn.Module, + *args, + **kwargs, + ) -> None: + super().__init__() + self.a = nn.Sequential( + nn.Conv2d(width_in, width_out, 3, stride=stride, padding=1, bias=False), + nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum), + activation, + ) + + self.b = nn.Sequential( + nn.Conv2d(width_out, width_out, 3, stride=1, padding=1, bias=False), + nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum), + activation, + ) + + self.depth = 2 + + +class ResBasicBlock(nn.Module): + """Residual basic block: x + F(x), F = basic transform.""" + + def __init__( + self, + width_in: int, + width_out: int, + stride: int, + bn_epsilon: float, + bn_momentum: float, + activation: nn.Module, + *args, + **kwargs, + ) -> None: + super().__init__() + self.proj_block = (width_in != width_out) or (stride != 1) + if self.proj_block: + self.proj = nn.Conv2d( + width_in, width_out, 1, stride=stride, padding=0, bias=False + ) + self.bn = nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum) + self.f = BasicTransform( + width_in, width_out, stride, bn_epsilon, bn_momentum, activation + ) + self.activation = activation + + # The projection and transform happen in parallel, + # and ReLU is not counted with 
respect to depth + self.depth = self.f.depth + + def forward(self, x: Tensor) -> Tensor: + if self.proj_block: + x = self.bn(self.proj(x)) + self.f(x) + else: + x = x + self.f(x) + + return self.activation(x) + + +class BottleneckTransform(nn.Sequential): + """Bottleneck transformation: 1x1, 3x3 [+SE], 1x1.""" + + def __init__( + self, + width_in: int, + width_out: int, + stride: int, + bn_epsilon: float, + bn_momentum: float, + activation: nn.Module, + group_width: int, + bottleneck_multiplier: float, + se_ratio: Optional[float], + ) -> None: + super().__init__() + w_b = int(round(width_out * bottleneck_multiplier)) + g = w_b // group_width + + self.a = nn.Sequential( + nn.Conv2d(width_in, w_b, 1, stride=1, padding=0, bias=False), + nn.BatchNorm2d(w_b, eps=bn_epsilon, momentum=bn_momentum), + activation, + ) + + self.b = nn.Sequential( + nn.Conv2d(w_b, w_b, 3, stride=stride, padding=1, groups=g, bias=False), + nn.BatchNorm2d(w_b, eps=bn_epsilon, momentum=bn_momentum), + activation, + ) + + if se_ratio: + # The SE reduction ratio is defined with respect to the + # beginning of the block + width_se_out = int(round(se_ratio * width_in)) + self.se = _SqueezeExcitation( + in_channels=w_b, + reduction_ratio=None, + reduced_channels=width_se_out, + activation=activation, + ) + + self.c = nn.Conv2d(w_b, width_out, 1, stride=1, padding=0, bias=False) + self.final_bn = nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum) + self.depth = 3 if not se_ratio else 4 + + +class ResBottleneckBlock(nn.Module): + """Residual bottleneck block: x + F(x), F = bottleneck transform.""" + + def __init__( + self, + width_in: int, + width_out: int, + stride: int, + bn_epsilon: float, + bn_momentum: float, + activation: nn.Module, + group_width: int = 1, + bottleneck_multiplier: float = 1.0, + se_ratio: Optional[float] = None, + ) -> None: + super().__init__() + + # Use skip connection with projection if shape changes + self.proj_block = (width_in != width_out) or (stride != 1) + if self.proj_block: + self.proj = nn.Conv2d( + width_in, width_out, 1, stride=stride, padding=0, bias=False + ) + self.bn = nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum) + self.f = BottleneckTransform( + width_in, + width_out, + stride, + bn_epsilon, + bn_momentum, + activation, + group_width, + bottleneck_multiplier, + se_ratio, + ) + self.activation = activation + + # The projection and transform happen in parallel, + # and activation is not counted with respect to depth + self.depth = self.f.depth + + def forward(self, x: Tensor) -> Tensor: + if self.proj_block: + x = self.bn(self.proj(x)) + self.f(x) + else: + x = x + self.f(x) + return self.activation(x) + + +class ResBottleneckLinearBlock(nn.Module): + """Residual linear bottleneck block: x + F(x), F = bottleneck transform.""" + + def __init__( + self, + width_in: int, + width_out: int, + stride: int, + bn_epsilon: float, + bn_momentum: float, + activation: nn.Module, + group_width: int = 1, + bottleneck_multiplier: float = 4.0, + se_ratio: Optional[float] = None, + ) -> None: + super().__init__() + self.has_skip = (width_in == width_out) and (stride == 1) + self.f = BottleneckTransform( + width_in, + width_out, + stride, + bn_epsilon, + bn_momentum, + activation, + group_width, + bottleneck_multiplier, + se_ratio, + ) + + self.depth = self.f.depth + + def forward(self, x: Tensor) -> Tensor: + return x + self.f(x) if self.has_skip else self.f(x) + + +class AnyStage(nn.Sequential): + """AnyNet stage (sequence of blocks w/ the same output shape).""" + + def 
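To make the bottleneck bookkeeping above concrete (worked numbers only, not part of the patch): for the first block of the second RegNetY-400MF stage, width_in=48, width_out=104, group_width=8, bottleneck_multiplier=1.0 and se_ratio=0.25, which gives
```
width_in, width_out = 48, 104
group_width, bottleneck_multiplier, se_ratio = 8, 1.0, 0.25

w_b = int(round(width_out * bottleneck_multiplier))   # 104 channels inside the bottleneck
groups = w_b // group_width                           # 13 groups in the 3x3 grouped conv
width_se = int(round(se_ratio * width_in))            # 12 channels in the SE reduction
print(w_b, groups, width_se)                          # 104 13 12
```
i.e. the SE reduction is computed from the block input width, not from the bottleneck width.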
__init__( + self, + width_in: int, + width_out: int, + stride: int, + depth: int, + block_constructor: nn.Module, + activation: nn.Module, + group_width: int, + bottleneck_multiplier: float, + params: "AnyNetParams", + stage_index: int = 0, + ) -> None: + super().__init__() + self.stage_depth = 0 + + for i in range(depth): + block = block_constructor( + width_in if i == 0 else width_out, + width_out, + stride if i == 0 else 1, + params.bn_epsilon, + params.bn_momentum, + activation, + group_width, + bottleneck_multiplier, + params.se_ratio, + ) + + self.stage_depth += block.depth + self.add_module(f"block{stage_index}-{i}", block) + + class RegNet(nn.Module): - pass + def __init__(self, params: RegNetParams) -> None: + super().__init__() + + if params.activation == ActivationType.SILU and torch.__version__ < "1.7": + raise ValueError("SiLU activation is only supported since PyTorch 1.7") + + silu = None if torch.__version__ < "1.7" else nn.SiLU() + activation = { + ActivationType.RELU: nn.ReLU(inplace=True), + ActivationType.SILU: silu, + }[params.activation] + + # Ad hoc stem + self.stem = { + StemType.RES_STEM_CIFAR: ResStemCifar, + StemType.RES_STEM_IN: ResStemIN, + StemType.SIMPLE_STEM_IN: SimpleStemIN, + }[params.stem_type]( + 3, # width_in + params.stem_width, + params.bn_epsilon, + params.bn_momentum, + activation, + ) + + # Instantiate all the AnyNet blocks in the trunk + block_fun = { + BlockType.VANILLA_BLOCK: VanillaBlock, + BlockType.RES_BASIC_BLOCK: ResBasicBlock, + BlockType.RES_BOTTLENECK_BLOCK: ResBottleneckBlock, + BlockType.RES_BOTTLENECK_LINEAR_BLOCK: ResBottleneckLinearBlock, + }[params.block_type] + + current_width = params.stem_width + + self.trunk_depth = 0 + + blocks = [] + for i, ( + width_out, + stride, + depth, + group_width, + bottleneck_multiplier, + ) in enumerate(params.get_expanded_params()): + blocks.append( + ( + f"block{i+1}", + AnyStage( + current_width, + width_out, + stride, + depth, + block_fun, + activation, + group_width, + bottleneck_multiplier, + params, + stage_index=i + 1, + ), + ) + ) + + self.trunk_depth += blocks[-1][1].stage_depth + + current_width = width_out + + self.trunk_output = nn.Sequential(OrderedDict(blocks)) + + # Init weights and good to go + self._init_weights() + + def forward(self, x: Tensor) -> Tensor: + x = self.stem(x) + x = self.trunk_output(x) + + return x + + def _init_weights(self) -> None: + # Performs ResNet-style weight initialization + for m in self.modules(): + if isinstance(m, nn.Conv2d): + # Note that there is no bias due to BN + fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + m.weight.data.normal_(mean=0.0, std=math.sqrt(2.0 / fan_out)) + elif isinstance(m, nn.BatchNorm2d): + m.weight.data.fill_(1.0) + m.bias.data.zero_() + elif isinstance(m, nn.Linear): + m.weight.data.normal_(mean=0.0, std=0.01) + m.bias.data.zero_() -def _regnet(arch: str, pretrained: bool, progress: bool, **kwargs: Any) -> RegNet: - model = RegNet() +def _regnet(arch: str, params: RegNetParams, pretrained: bool, progress: bool, **kwargs: Any) -> RegNet: + model = RegNet(params) if pretrained: if arch not in model_urls: raise ValueError(f"No checkpoint is available for model type {arch}") @@ -65,4 +613,5 @@ def _regnet(arch: str, pretrained: bool, progress: bool, **kwargs: Any) -> RegNe def regnet_y_400mf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: - return _regnet("regnet_y_400mf", pretrained, progress, **kwargs) + params = RegNetParams(depth=16, w_0=48, w_a=27.89, w_m=2.09, group_width=8, 
**kwargs) + return _regnet("regnet_y_400mf", params, pretrained, progress, **kwargs) From 56352a034748a9b545b54df8b95909eb57b65199 Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Tue, 14 Sep 2021 06:21:00 +0000 Subject: [PATCH 07/40] nit --- torchvision/models/regnet.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py index eb15da6e0ad..80fed6d6a6e 100644 --- a/torchvision/models/regnet.py +++ b/torchvision/models/regnet.py @@ -4,7 +4,7 @@ from collections import OrderedDict from enum import Enum, auto -from typing import Any, List, Optional +from typing import Any, List, Optional, Tuple from torch import nn, Tensor from torchvision.models.mobilenetv2 import _make_divisible @@ -131,7 +131,7 @@ def get_expanded_params(self): @staticmethod def _adjust_widths_groups_compatibilty( stage_widths: List[int], bottleneck_ratios: List[float], - group_widths: List[int]) -> Tuple(List[int], List[int]): + group_widths: List[int]) -> Tuple[List[int], List[int]]: """ Adjusts the compatibility of widths and groups, depending on the bottleneck ratio. From ce181c3ef97facacdf6841b87cda208c2e99fbb7 Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Tue, 14 Sep 2021 23:06:53 +0000 Subject: [PATCH 08/40] add fc layer --- torchvision/models/regnet.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py index 80fed6d6a6e..5d813d59b9e 100644 --- a/torchvision/models/regnet.py +++ b/torchvision/models/regnet.py @@ -51,6 +51,7 @@ def __init__( se_ratio: float = 0.25, bn_epsilon: float = 1e-05, bn_momentum: bool = 0.1, + num_classes: int = 1000, ) -> None: if w_a < 0 or w_0 <= 0 or w_m <= 1 or w_0 % 8 != 0: raise ValueError("Invalid RegNet settings") @@ -68,6 +69,7 @@ def __init__( self.se_ratio = se_ratio if use_se else None self.bn_epsilon = bn_epsilon self.bn_momentum = bn_momentum + self.num_classes = num_classes def get_expanded_params(self): """ @@ -578,13 +580,20 @@ def __init__(self, params: RegNetParams) -> None: self.trunk_output = nn.Sequential(OrderedDict(blocks)) + self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) + self.fc = nn.Linear(in_features=current_width, out_features=params.num_classes) + # Init weights and good to go self._init_weights() def forward(self, x: Tensor) -> Tensor: x = self.stem(x) x = self.trunk_output(x) - + + x = self.avgpool(x) + x = x.flatten(start_dim=1) + x = self.fc(x) + return x def _init_weights(self) -> None: From a91c32b309334e4e4e4d9084df858c45d45a607d Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Wed, 15 Sep 2021 07:19:45 +0000 Subject: [PATCH 09/40] use Callable instead of Enum for block, stem and activation --- torchvision/models/regnet.py | 296 ++++++++++++++++------------------- 1 file changed, 132 insertions(+), 164 deletions(-) diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py index 5d813d59b9e..76d3381fe5b 100644 --- a/torchvision/models/regnet.py +++ b/torchvision/models/regnet.py @@ -4,150 +4,19 @@ from collections import OrderedDict from enum import Enum, auto -from typing import Any, List, Optional, Tuple +from typing import Any, Callable, List, Optional, Tuple from torch import nn, Tensor + +from .._internally_replaced_utils import load_state_dict_from_url from torchvision.models.mobilenetv2 import _make_divisible model_urls = { + # TODO(kazhang): add pretrained weights + "regnet_y_400m": "", } -# The different possible blocks -class BlockType(Enum): - VANILLA_BLOCK = auto() - 
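With the pooling and fully connected head now in place, a minimal smoke test of the earlier builder looks like this (illustration only; no pretrained weights are assumed, since the checkpoint URLs are still TODO):
```
import torch

model = regnet_y_400mf(pretrained=False)
model.eval()
with torch.no_grad():
    logits = model(torch.randn(1, 3, 224, 224))
print(logits.shape)   # torch.Size([1, 1000]) with the default num_classes
```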
RES_BASIC_BLOCK = auto() - RES_BOTTLENECK_BLOCK = auto() - RES_BOTTLENECK_LINEAR_BLOCK = auto() - - -# The different possible Stems -class StemType(Enum): - RES_STEM_CIFAR = auto() - RES_STEM_IN = auto() - SIMPLE_STEM_IN = auto() - - -# The different possible activations -class ActivationType(Enum): - RELU = auto() - SILU = auto() - - -class RegNetParams: - def __init__( - self, - depth: int, - w_0: int, - w_a: float, - w_m: float, - group_width: int, - bottleneck_multiplier: float = 1.0, - stem_type: StemType = StemType.SIMPLE_STEM_IN, - stem_width: int = 32, - block_type: BlockType = BlockType.RES_BOTTLENECK_BLOCK, - activation: ActivationType = ActivationType.RELU, - use_se: bool = True, - se_ratio: float = 0.25, - bn_epsilon: float = 1e-05, - bn_momentum: bool = 0.1, - num_classes: int = 1000, - ) -> None: - if w_a < 0 or w_0 <= 0 or w_m <= 1 or w_0 % 8 != 0: - raise ValueError("Invalid RegNet settings") - self.depth = depth - self.w_0 = w_0 - self.w_a = w_a - self.w_m = w_m - self.group_width = group_width - self.bottleneck_multiplier = bottleneck_multiplier - self.stem_type = stem_type - self.block_type = block_type - self.activation = activation - self.stem_width = stem_width - self.use_se = use_se - self.se_ratio = se_ratio if use_se else None - self.bn_epsilon = bn_epsilon - self.bn_momentum = bn_momentum - self.num_classes = num_classes - - def get_expanded_params(self): - """ - Programatically compute all the per-block settings, - given the RegNet parameters. - - The first step is to compute the quantized linear block parameters, - in log space. Key parameters are: - - `w_a` is the width progression slope - - `w_0` is the initial width - - `w_m` is the width stepping in the log space - - In other terms - `log(block_width) = log(w_0) + w_m * block_capacity`, - with `bock_capacity` ramping up following the w_0 and w_a params. - This block width is finally quantized to multiples of 8. - - The second step is to compute the parameters per stage, - taking into account the skip connection and the final 1x1 convolutions. - We use the fact that the output width is constant within a stage. - """ - - QUANT = 8 - STRIDE = 2 - - # Compute the block widths. 
Each stage has one unique block width - widths_cont = np.arange(self.depth) * self.w_a + self.w_0 - block_capacity = np.round(np.log(widths_cont / self.w_0) / np.log(self.w_m)) - block_widths = ( - np.round(np.divide(self.w_0 * np.power(self.w_m, block_capacity), QUANT)) - * QUANT - ) - num_stages = len(np.unique(block_widths)) - block_widths = block_widths.astype(int).tolist() - - # Convert to per stage parameters - split_helper = zip( - block_widths + [0], - [0] + block_widths, - block_widths + [0], - [0] + block_widths, - ) - splits = [w != wp or r != rp for w, wp, r, rp in split_helper] - - stage_widths = [w for w, t in zip(block_widths, splits[:-1]) if t] - stage_depths = np.diff([d for d, t in enumerate(splits) if t]).tolist() - - strides = [STRIDE] * num_stages - bottleneck_multipliers = [self.bottleneck_multiplier] * num_stages - group_widths = [self.group_width] * num_stages - - # Adjust the compatibility of stage widths and group widths - stage_widths, group_widths = self._adjust_widths_groups_compatibilty( - stage_widths, bottleneck_multipliers, group_widths - ) - - return zip( - stage_widths, strides, stage_depths, group_widths, bottleneck_multipliers - ) - - @staticmethod - def _adjust_widths_groups_compatibilty( - stage_widths: List[int], bottleneck_ratios: List[float], - group_widths: List[int]) -> Tuple[List[int], List[int]]: - """ - Adjusts the compatibility of widths and groups, - depending on the bottleneck ratio. - """ - # Compute all widths for the current settings - widths = [int(w * b) for w, b in zip(stage_widths, bottleneck_ratios)] - group_widths_min = [min(g, w_bot) for g, w_bot in zip(group_widths, widths)] - - # Compute the adjusted widths so that stage and group widths fit - ws_bot = [_make_divisible(w_bot, g) for w_bot, g in zip(widths, group_widths_min)] - stage_widths = [int(w_bot / b) for w_bot, b in zip(ws_bot, bottleneck_ratios)] - return stage_widths, group_widths_min - - class _SqueezeExcitation(nn.Module): """ Squeeze and excitation layer from @@ -483,11 +352,13 @@ def __init__( width_out: int, stride: int, depth: int, - block_constructor: nn.Module, + block_constructor: Callable[..., nn.Module], + bn_epsilon: float, + bn_momentum: float, activation: nn.Module, group_width: int, bottleneck_multiplier: float, - params: "AnyNetParams", + se_ratio: Optional[float] = None, stage_index: int = 0, ) -> None: super().__init__() @@ -498,37 +369,140 @@ def __init__( width_in if i == 0 else width_out, width_out, stride if i == 0 else 1, - params.bn_epsilon, - params.bn_momentum, + bn_epsilon, + bn_momentum, activation, group_width, bottleneck_multiplier, - params.se_ratio, + se_ratio, ) self.stage_depth += block.depth self.add_module(f"block{stage_index}-{i}", block) +class RegNetParams: + def __init__( + self, + depth: int, + w_0: int, + w_a: float, + w_m: float, + group_width: int, + bottleneck_multiplier: float = 1.0, + stem_type: Callable[..., nn.Module] = SimpleStemIN, + stem_width: int = 32, + block_type: Callable[..., nn.Module] = ResBottleneckBlock, + activation: Callable[..., nn.Module] = nn.ReLU, + use_se: bool = True, + se_ratio: float = 0.25, + bn_epsilon: float = 1e-05, + bn_momentum: float = 0.1, + num_classes: int = 1000, + ) -> None: + if w_a < 0 or w_0 <= 0 or w_m <= 1 or w_0 % 8 != 0: + raise ValueError("Invalid RegNet settings") + self.depth = depth + self.w_0 = w_0 + self.w_a = w_a + self.w_m = w_m + self.group_width = group_width + self.bottleneck_multiplier = bottleneck_multiplier + self.stem_type = stem_type + self.block_type = block_type 
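Because the configuration now takes plain constructors, swapping a component is just a matter of passing a different Callable; for example (illustration only, roughly the regnet_y_400mf settings but with SiLU, which also accepts the `inplace=True` argument used internally):
```
from torch import nn

params = RegNetParams(depth=16, w_0=48, w_a=27.89, w_m=2.09, group_width=8,
                      activation=nn.SiLU)
model = RegNet(params)
```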
+ self.activation = activation + self.stem_width = stem_width + self.use_se = use_se + self.se_ratio = se_ratio if use_se else None + self.bn_epsilon = bn_epsilon + self.bn_momentum = bn_momentum + self.num_classes = num_classes + + def get_expanded_params(self): + """ + Programatically compute all the per-block settings, + given the RegNet parameters. + + The first step is to compute the quantized linear block parameters, + in log space. Key parameters are: + - `w_a` is the width progression slope + - `w_0` is the initial width + - `w_m` is the width stepping in the log space + + In other terms + `log(block_width) = log(w_0) + w_m * block_capacity`, + with `bock_capacity` ramping up following the w_0 and w_a params. + This block width is finally quantized to multiples of 8. + + The second step is to compute the parameters per stage, + taking into account the skip connection and the final 1x1 convolutions. + We use the fact that the output width is constant within a stage. + """ + + QUANT = 8 + STRIDE = 2 + + # Compute the block widths. Each stage has one unique block width + widths_cont = np.arange(self.depth) * self.w_a + self.w_0 + block_capacity = np.round(np.log(widths_cont / self.w_0) / np.log(self.w_m)) + block_widths = ( + np.round(np.divide(self.w_0 * np.power(self.w_m, block_capacity), QUANT)) + * QUANT + ) + num_stages = len(np.unique(block_widths)) + block_widths = block_widths.astype(int).tolist() + + # Convert to per stage parameters + split_helper = zip( + block_widths + [0], + [0] + block_widths, + block_widths + [0], + [0] + block_widths, + ) + splits = [w != wp or r != rp for w, wp, r, rp in split_helper] + + stage_widths = [w for w, t in zip(block_widths, splits[:-1]) if t] + stage_depths = np.diff([d for d, t in enumerate(splits) if t]).tolist() + + strides = [STRIDE] * num_stages + bottleneck_multipliers = [self.bottleneck_multiplier] * num_stages + group_widths = [self.group_width] * num_stages + + # Adjust the compatibility of stage widths and group widths + stage_widths, group_widths = self._adjust_widths_groups_compatibilty( + stage_widths, bottleneck_multipliers, group_widths + ) + + return zip( + stage_widths, strides, stage_depths, group_widths, bottleneck_multipliers + ) + + @staticmethod + def _adjust_widths_groups_compatibilty( + stage_widths: List[int], bottleneck_ratios: List[float], + group_widths: List[int]) -> Tuple[List[int], List[int]]: + """ + Adjusts the compatibility of widths and groups, + depending on the bottleneck ratio. 
+ """ + # Compute all widths for the current settings + widths = [int(w * b) for w, b in zip(stage_widths, bottleneck_ratios)] + group_widths_min = [min(g, w_bot) for g, w_bot in zip(group_widths, widths)] + + # Compute the adjusted widths so that stage and group widths fit + ws_bot = [_make_divisible(w_bot, g) for w_bot, g in zip(widths, group_widths_min)] + stage_widths = [int(w_bot / b) for w_bot, b in zip(ws_bot, bottleneck_ratios)] + return stage_widths, group_widths_min + + class RegNet(nn.Module): def __init__(self, params: RegNetParams) -> None: super().__init__() - if params.activation == ActivationType.SILU and torch.__version__ < "1.7": - raise ValueError("SiLU activation is only supported since PyTorch 1.7") - - silu = None if torch.__version__ < "1.7" else nn.SiLU() - activation = { - ActivationType.RELU: nn.ReLU(inplace=True), - ActivationType.SILU: silu, - }[params.activation] + activation = params.activation(inplace=True) # Ad hoc stem - self.stem = { - StemType.RES_STEM_CIFAR: ResStemCifar, - StemType.RES_STEM_IN: ResStemIN, - StemType.SIMPLE_STEM_IN: SimpleStemIN, - }[params.stem_type]( + self.stem = params.stem_type( 3, # width_in params.stem_width, params.bn_epsilon, @@ -536,14 +510,6 @@ def __init__(self, params: RegNetParams) -> None: activation, ) - # Instantiate all the AnyNet blocks in the trunk - block_fun = { - BlockType.VANILLA_BLOCK: VanillaBlock, - BlockType.RES_BASIC_BLOCK: ResBasicBlock, - BlockType.RES_BOTTLENECK_BLOCK: ResBottleneckBlock, - BlockType.RES_BOTTLENECK_LINEAR_BLOCK: ResBottleneckLinearBlock, - }[params.block_type] - current_width = params.stem_width self.trunk_depth = 0 @@ -564,11 +530,13 @@ def __init__(self, params: RegNetParams) -> None: width_out, stride, depth, - block_fun, + params.block_type, + params.bn_epsilon, + params.bn_momentum, activation, group_width, bottleneck_multiplier, - params, + params.se_ratio, stage_index=i + 1, ), ) From 0d1601bf6b188cc1c91f9f5bbcfc244e85910ce4 Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Wed, 15 Sep 2021 21:30:52 +0000 Subject: [PATCH 10/40] add regnet_x and regnet_y model build functions, add docs --- docs/source/models.rst | 48 +++++++ hubconf.py | 4 + references/classification/README.md | 7 + torchvision/models/regnet.py | 193 +++++++++++++++++++++++++++- 4 files changed, 251 insertions(+), 1 deletion(-) diff --git a/docs/source/models.rst b/docs/source/models.rst index 3f31455f9da..be2a007d9ae 100644 --- a/docs/source/models.rst +++ b/docs/source/models.rst @@ -37,6 +37,7 @@ architectures for image classification: - `Wide ResNet`_ - `MNASNet`_ - `EfficientNet`_ +- `RegNet`_ You can construct a model with random weights by calling its constructor: @@ -65,6 +66,20 @@ You can construct a model with random weights by calling its constructor: efficientnet_b5 = models.efficientnet_b5() efficientnet_b6 = models.efficientnet_b6() efficientnet_b7 = models.efficientnet_b7() + regnet_y_400mf = models.regnet_y_400mf() + regnet_y_800mf = models.regnet_y_800mf() + regnet_y_1_6gf = models.regnet_y_1_6gf() + regnet_y_3_2gf = models.regnet_y_3_2gf() + regnet_y_8gf = models.regnet_y_8gf() + regnet_y_16gf = models.regnet_y_16gf() + regnet_y_32gf = models.regnet_y_32gf() + regnet_x_400mf = models.regnet_x_400mf() + regnet_x_800mf = models.regnet_x_800mf() + regnet_x_1_6gf = models.regnet_x_1_6gf() + regnet_x_3_2gf = models.regnet_x_3_2gf() + regnet_x_8gf = models.regnet_x_8gf() + regnet_x_16gf = models.regnet_x_16gf() + regnet_x_32gf = models.regnet_x_32gf() We provide pre-trained models, using the PyTorch 
:mod:`torch.utils.model_zoo`. These can be constructed by passing ``pretrained=True``: @@ -94,6 +109,20 @@ These can be constructed by passing ``pretrained=True``: efficientnet_b5 = models.efficientnet_b5(pretrained=True) efficientnet_b6 = models.efficientnet_b6(pretrained=True) efficientnet_b7 = models.efficientnet_b7(pretrained=True) + regnet_y_400mf = models.regnet_y_400mf(pretrained=True) + regnet_y_800mf = models.regnet_y_800mf(pretrained=True) + regnet_y_1_6gf = models.regnet_y_1_6gf(pretrained=True) + regnet_y_3_2gf = models.regnet_y_3_2gf(pretrained=True) + regnet_y_8gf = models.regnet_y_8gf(pretrained=True) + regnet_y_16gf = models.regnet_y_16gf(pretrained=True) + regnet_y_32gf = models.regnet_y_32gf(pretrained=True) + regnet_x_400mf = models.regnet_x_400mf(pretrained=True) + regnet_x_800mf = models.regnet_x_800mf(pretrained=True) + regnet_x_1_6gf = models.regnet_x_1_6gf(pretrained=True) + regnet_x_3_2gf = models.regnet_x_3_2gf(pretrained=True) + regnet_x_8gf = models.regnet_x_8gf(pretrained=True) + regnet_x_16gf = models.regnet_x_16gf(pretrained=True) + regnet_x_32gf = models.regnet_x_32gf(pretrained=True) Instancing a pre-trained model will download its weights to a cache directory. This directory can be set using the `TORCH_MODEL_ZOO` environment variable. See @@ -204,6 +233,7 @@ EfficientNet-B7 84.122 96.908 .. _ResNeXt: https://arxiv.org/abs/1611.05431 .. _MNASNet: https://arxiv.org/abs/1807.11626 .. _EfficientNet: https://arxiv.org/abs/1905.11946 +.. _RegNet: https://arxiv.org/abs/2003.13678 .. currentmodule:: torchvision.models @@ -317,6 +347,24 @@ EfficientNet .. autofunction:: efficientnet_b6 .. autofunction:: efficientnet_b7 +EfficientNet +------------ + +.. autofunction:: regnet_y_400mf +.. autofunction:: regnet_y_800mf +.. autofunction:: regnet_y_1_6gf +.. autofunction:: regnet_y_3_2gf +.. autofunction:: regnet_y_8gf +.. autofunction:: regnet_y_16gf +.. autofunction:: regnet_y_32gf +.. autofunction:: regnet_x_400mf +.. autofunction:: regnet_x_800mf +.. autofunction:: regnet_x_1_6gf +.. autofunction:: regnet_x_3_2gf +.. autofunction:: regnet_x_8gf +.. autofunction:: regnet_x_16gf +.. autofunction:: regnet_x_32gf + Quantized Models ---------------- diff --git a/hubconf.py b/hubconf.py index 2bff6850525..8412e9e6e6b 100644 --- a/hubconf.py +++ b/hubconf.py @@ -17,6 +17,10 @@ mnasnet1_3 from torchvision.models.efficientnet import efficientnet_b0, efficientnet_b1, efficientnet_b2, \ efficientnet_b3, efficientnet_b4, efficientnet_b5, efficientnet_b6, efficientnet_b7 +from torchvision.models.regnet import regnet_y_400mf, regnet_y_800mf, \ + regnet_y_1_6gf, regnet_y_3_2gf, regnet_y_8gf, regnet_y_16gf, regnet_y_32gf, \ + regnet_x_400mf, regnet_x_800mf, regnet_x_1_6gf, regnet_x_3_2gf, regnet_x_8gf, \ + regnet_x_16gf, regnet_x_32gf # segmentation from torchvision.models.segmentation import fcn_resnet50, fcn_resnet101, \ diff --git a/references/classification/README.md b/references/classification/README.md index e293f53d0ea..79149758428 100644 --- a/references/classification/README.md +++ b/references/classification/README.md @@ -79,6 +79,13 @@ The weights of the B0-B4 variants are ported from Ross Wightman's [timm repo](ht The weights of the B5-B7 variants are ported from Luke Melas' [EfficientNet-PyTorch repo](https://github.com/lukemelas/EfficientNet-PyTorch/blob/1039e009545d9329ea026c9f7541341439712b96/efficientnet_pytorch/utils.py#L562-L564). 
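Since hubconf.py above now re-exports the new builders, the models should also be reachable through torch.hub once this lands (illustration only; shown without pretrained weights because the checkpoints are not published yet at this point in the series):
```
import torch

model = torch.hub.load("pytorch/vision", "regnet_y_400mf", pretrained=False)
```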
+ +### RegNet +``` +python -m torch.distributed.launch --nproc_per_node=8 --use_env train.py\ + --model regnet_y_400mf --epochs 100 +``` + ## Mixed precision training Automatic Mixed Precision (AMP) training on GPU for Pytorch can be enabled with the [NVIDIA Apex extension](https://github.com/NVIDIA/apex). diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py index 76d3381fe5b..98c60296015 100644 --- a/torchvision/models/regnet.py +++ b/torchvision/models/regnet.py @@ -1,3 +1,8 @@ +# Modified from +# https://github.com/facebookresearch/ClassyVision/blob/main/classy_vision/models/anynet.py +# https://github.com/facebookresearch/ClassyVision/blob/main/classy_vision/models/regnet.py + + import numpy as np import math import torch @@ -10,7 +15,6 @@ from .._internally_replaced_utils import load_state_dict_from_url from torchvision.models.mobilenetv2 import _make_divisible - model_urls = { # TODO(kazhang): add pretrained weights "regnet_y_400m": "", @@ -590,5 +594,192 @@ def _regnet(arch: str, params: RegNetParams, pretrained: bool, progress: bool, * def regnet_y_400mf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetY_400MF architecture from + `"Designing Network Design Spaces" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ params = RegNetParams(depth=16, w_0=48, w_a=27.89, w_m=2.09, group_width=8, **kwargs) return _regnet("regnet_y_400mf", params, pretrained, progress, **kwargs) + + +def regnet_y_800mf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetY_800MF architecture from + `"Designing Network Design Spaces" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = RegNetParams(depth=14, w_0=56, w_a=38.84, w_m=2.4, group_width=16, **kwargs) + return _regnet("regnet_y_800mf", params, pretrained, progress, **kwargs) + + +def regnet_y_1_6gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetY_1.6GF architecture from + `"Designing Network Design Spaces" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = RegNetParams(depth=27, w_0=48, w_a=20.71, w_m=2.65, group_width=24, **kwargs) + return _regnet("regnet_y_1_6gf", params, pretrained, progress, **kwargs) + + +def regnet_y_3_2gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetY_3.2GF architecture from + `"Designing Network Design Spaces" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = RegNetParams(depth=21, w_0=80, w_a=42.63, w_m=2.66, group_width=24, **kwargs) + return _regnet("regnet_y_3_2gf", params, pretrained, progress, **kwargs) + + +def regnet_y_8gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetY_8GF architecture from + `"Designing Network Design Spaces" `_. 
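The suffixes in the builder names (400MF, 800MF, 1.6GF, ...) refer to the approximate FLOP regime of each variant. A quick, illustrative way to see how the configurations scale (assuming a build of torchvision that includes this branch) is to count parameters:
```
import torch
from torchvision import models

for name in ["regnet_y_400mf", "regnet_y_800mf", "regnet_y_1_6gf"]:
    n_params = sum(p.numel() for p in getattr(models, name)().parameters())
    print(f"{name}: {n_params / 1e6:.1f}M parameters")
```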
+ + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = RegNetParams(depth=17, w_0=192, w_a=76.82, w_m=2.19, group_width=56, **kwargs) + return _regnet("regnet_y_8gf", params, pretrained, progress, **kwargs) + + +def regnet_y_16gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetY_16GF architecture from + `"Designing Network Design Spaces" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = RegNetParams(depth=18, w_0=200, w_a=106.23, w_m=2.48, group_width=112, **kwargs) + return _regnet("regnet_y_16gf", params, pretrained, progress, **kwargs) + + +def regnet_y_32gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetY_32GF architecture from + `"Designing Network Design Spaces" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = RegNetParams(depth=20, w_0=232, w_a=115.89, w_m=2.53, group_width=232, **kwargs) + return _regnet("regnet_y_32gf", params, pretrained, progress, **kwargs) + + +def regnet_x_400mf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetX_400MF architecture from + `"Designing Network Design Spaces" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = RegNetParams(depth=22, w_0=24, w_a=24.48, w_m=2.54, + group_width=16, use_se=False, **kwargs) + + return _regnet("regnet_x_400mf", params, pretrained, progress, **kwargs) + + +def regnet_x_800mf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetX_800MF architecture from + `"Designing Network Design Spaces" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = RegNetParams(depth=16, w_0=56, w_a=35.73, w_m=2.28, group_width=16, + use_se=False, **kwargs) + return _regnet("regnet_x_800mf", params, pretrained, progress, **kwargs) + + +def regnet_x_1_6gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetX_1.6GF architecture from + `"Designing Network Design Spaces" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = RegNetParams(depth=18, w_0=80, w_a=34.01, w_m=2.25, group_width=24, + use_se=False, **kwargs) + return _regnet("regnet_x_1_6gf", params, pretrained, progress, **kwargs) + + +def regnet_x_3_2gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetX_3.2GF architecture from + `"Designing Network Design Spaces" `_. 
+ + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = RegNetParams(depth=25, w_0=88, w_a=26.31, w_m=2.25, group_width=48, + use_se=False, **kwargs) + return _regnet("regnet_x_3_2gf", params, pretrained, progress, **kwargs) + + +def regnet_x_8gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetX_8GF architecture from + `"Designing Network Design Spaces" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = RegNetParams(depth=23, w_0=80, w_a=49.56, w_m=2.88, group_width=120, + use_se=False, **kwargs) + return _regnet("regnet_x_8gf", params, pretrained, progress, **kwargs) + + +def regnet_x_16gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetX_16GF architecture from + `"Designing Network Design Spaces" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = RegNetParams(depth=22, w_0=216, w_a=55.59, w_m=2.1, group_width=128, + use_se=False, **kwargs) + return _regnet("regnet_x_16gf", params, pretrained, progress, **kwargs) + + +def regnet_x_32gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetX_32GF architecture from + `"Designing Network Design Spaces" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = RegNetParams(depth=23, w_0=320, w_a=69.86, w_m=2.0, group_width=168, + use_se=False, **kwargs) + return _regnet("regnet_x_32gf", params, pretrained, progress, **kwargs) + +# TODO(kazhang): Add RegNetZ_500MF and RegNetZ_4GF From 59c5c7e65742d364004e0571816cf341c92e6262 Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Thu, 16 Sep 2021 23:25:42 +0000 Subject: [PATCH 11/40] remove unused depth --- torchvision/models/regnet.py | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py index 98c60296015..d405945267e 100644 --- a/torchvision/models/regnet.py +++ b/torchvision/models/regnet.py @@ -8,7 +8,6 @@ import torch from collections import OrderedDict -from enum import Enum, auto from typing import Any, Callable, List, Optional, Tuple from torch import nn, Tensor @@ -83,7 +82,6 @@ def __init__( ) self.final_bn = nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum) - self.depth = 2 class ResStemCifar(nn.Sequential): @@ -103,7 +101,6 @@ def __init__( nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum), activation, ) - self.depth = 2 class ResStemIN(nn.Sequential): @@ -124,7 +121,6 @@ def __init__( activation, nn.MaxPool2d(3, stride=2, padding=1), ) - self.depth = 3 class SimpleStemIN(nn.Sequential): @@ -144,7 +140,6 @@ def __init__( nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum), activation, ) - self.depth = 2 class VanillaBlock(nn.Sequential): @@ -174,8 +169,6 @@ def __init__( activation, ) - self.depth = 2 - class ResBasicBlock(nn.Module): """Residual basic block: x + F(x), F = basic transform.""" @@ -203,10 +196,6 @@ def __init__( ) self.activation = activation - # The projection and transform happen in parallel, - # and ReLU is not 
counted with respect to depth - self.depth = self.f.depth - def forward(self, x: Tensor) -> Tensor: if self.proj_block: x = self.bn(self.proj(x)) + self.f(x) @@ -260,7 +249,6 @@ def __init__( self.c = nn.Conv2d(w_b, width_out, 1, stride=1, padding=0, bias=False) self.final_bn = nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum) - self.depth = 3 if not se_ratio else 4 class ResBottleneckBlock(nn.Module): @@ -302,7 +290,6 @@ def __init__( # The projection and transform happen in parallel, # and activation is not counted with respect to depth - self.depth = self.f.depth def forward(self, x: Tensor) -> Tensor: if self.proj_block: @@ -341,8 +328,6 @@ def __init__( se_ratio, ) - self.depth = self.f.depth - def forward(self, x: Tensor) -> Tensor: return x + self.f(x) if self.has_skip else self.f(x) @@ -366,7 +351,6 @@ def __init__( stage_index: int = 0, ) -> None: super().__init__() - self.stage_depth = 0 for i in range(depth): block = block_constructor( @@ -381,7 +365,6 @@ def __init__( se_ratio, ) - self.stage_depth += block.depth self.add_module(f"block{stage_index}-{i}", block) @@ -516,8 +499,6 @@ def __init__(self, params: RegNetParams) -> None: current_width = params.stem_width - self.trunk_depth = 0 - blocks = [] for i, ( width_out, @@ -546,8 +527,6 @@ def __init__(self, params: RegNetParams) -> None: ) ) - self.trunk_depth += blocks[-1][1].stage_depth - current_width = width_out self.trunk_output = nn.Sequential(OrderedDict(blocks)) @@ -695,7 +674,6 @@ def regnet_x_400mf(pretrained: bool = False, progress: bool = True, **kwargs: An """ params = RegNetParams(depth=22, w_0=24, w_a=24.48, w_m=2.54, group_width=16, use_se=False, **kwargs) - return _regnet("regnet_x_400mf", params, pretrained, progress, **kwargs) From 33ad54e740d765d0927cee952e0096fc38a30877 Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Fri, 17 Sep 2021 07:10:50 +0000 Subject: [PATCH 12/40] use BN/activation constructor and ConvBNActivation --- torchvision/models/regnet.py | 187 +++++++++++++---------------------- 1 file changed, 66 insertions(+), 121 deletions(-) diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py index d405945267e..9c3330ec726 100644 --- a/torchvision/models/regnet.py +++ b/torchvision/models/regnet.py @@ -8,11 +8,12 @@ import torch from collections import OrderedDict +from functools import partial from typing import Any, Callable, List, Optional, Tuple from torch import nn, Tensor from .._internally_replaced_utils import load_state_dict_from_url -from torchvision.models.mobilenetv2 import _make_divisible +from torchvision.models.mobilenetv2 import ConvBNActivation, _make_divisible model_urls = { # TODO(kazhang): add pretrained weights @@ -68,39 +69,17 @@ def __init__( width_in: int, width_out: int, stride: int, - bn_epsilon: float, - bn_momentum: float, - activation: nn.Module, + norm_layer: Callable[..., nn.Module], + activation_layer: Callable[..., nn.Module], ) -> None: - super().__init__() - - self.a = nn.Sequential( - nn.Conv2d(width_in, width_out, 3, stride=stride, padding=1, bias=False), - nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum), - activation, - nn.Conv2d(width_out, width_out, 3, stride=1, padding=1, bias=False), - ) - - self.final_bn = nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum) - - -class ResStemCifar(nn.Sequential): - """ResNet stem for CIFAR: 3x3, BN, ReLU.""" - - def __init__( - self, - width_in: int, - width_out: int, - bn_epsilon: float, - bn_momentum: float, - activation: nn.Module, - ) -> None: - 
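The refactor in this patch leans on ConvBNActivation from the MobileNetV2 code, which bundles a bias-free Conv2d, the given norm layer and the given activation with "same"-style padding; e.g. the simple stem collapses to a single call (illustration only):
```
import torch
from torch import nn
from torchvision.models.mobilenetv2 import ConvBNActivation

stem = ConvBNActivation(3, 32, kernel_size=3, stride=2,
                        norm_layer=nn.BatchNorm2d, activation_layer=nn.ReLU)
print(stem(torch.randn(1, 3, 224, 224)).shape)   # torch.Size([1, 32, 112, 112])
```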
super().__init__() - self.stem = nn.Sequential( - nn.Conv2d(width_in, width_out, 3, stride=1, padding=1, bias=False), - nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum), - activation, - ) + super().__init__(OrderedDict( + a=nn.Sequential( + ConvBNActivation(width_in, width_out, kernel_size=3, stride=stride, + norm_layer=norm_layer, activation_layer=activation_layer), + nn.Conv2d(width_out, width_out, 3, stride=1, padding=1, bias=False), + ), + final_bn=norm_layer(width_out), + )) class ResStemIN(nn.Sequential): @@ -110,36 +89,28 @@ def __init__( self, width_in: int, width_out: int, - bn_epsilon: float, - bn_momentum: float, - activation: nn.Module, + norm_layer: Callable[..., nn.Module], + activation_layer: Callable[..., nn.Module], ) -> None: - super().__init__() - self.stem = nn.Sequential( - nn.Conv2d(width_in, width_out, 7, stride=2, padding=3, bias=False), - nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum), - activation, + super().__init__( + ConvBNActivation(width_in, width_out, kernel_size=7, stride=2, + norm_layer=norm_layer, activation_layer=activation_layer), nn.MaxPool2d(3, stride=2, padding=1), ) -class SimpleStemIN(nn.Sequential): +class SimpleStemIN(ConvBNActivation): """Simple stem for ImageNet: 3x3, BN, ReLU.""" def __init__( self, width_in: int, width_out: int, - bn_epsilon: float, - bn_momentum: float, - activation: nn.Module, + norm_layer: Callable[..., nn.Module], + activation_layer: Callable[..., nn.Module], ) -> None: - super().__init__() - self.stem = nn.Sequential( - nn.Conv2d(width_in, width_out, 3, stride=2, padding=1, bias=False), - nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum), - activation, - ) + super().__init__(width_in, width_out, kernel_size=3, stride=2, + norm_layer=norm_layer, activation_layer=activation_layer) class VanillaBlock(nn.Sequential): @@ -150,24 +121,17 @@ def __init__( width_in: int, width_out: int, stride: int, - bn_epsilon: float, - bn_momentum: float, - activation: nn.Module, + norm_layer: Callable[..., nn.Module], + activation_layer: Callable[..., nn.Module], *args, **kwargs, ) -> None: - super().__init__() - self.a = nn.Sequential( - nn.Conv2d(width_in, width_out, 3, stride=stride, padding=1, bias=False), - nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum), - activation, - ) - - self.b = nn.Sequential( - nn.Conv2d(width_out, width_out, 3, stride=1, padding=1, bias=False), - nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum), - activation, - ) + super().__init__(OrderedDict( + a=ConvBNActivation(width_in, width_out, kernel_size=3, stride=stride, + norm_layer=norm_layer, activation_layer=activation_layer), + b=ConvBNActivation(width_out, width_out, kernel_size=3, stride=1, + norm_layer=norm_layer, activation_layer=activation_layer), + )) class ResBasicBlock(nn.Module): @@ -178,9 +142,8 @@ def __init__( width_in: int, width_out: int, stride: int, - bn_epsilon: float, - bn_momentum: float, - activation: nn.Module, + norm_layer: Callable[..., nn.Module], + activation_layer: Callable[..., nn.Module], *args, **kwargs, ) -> None: @@ -190,11 +153,11 @@ def __init__( self.proj = nn.Conv2d( width_in, width_out, 1, stride=stride, padding=0, bias=False ) - self.bn = nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum) + self.bn = norm_layer(width_out) self.f = BasicTransform( - width_in, width_out, stride, bn_epsilon, bn_momentum, activation + width_in, width_out, stride, norm_layer, activation_layer ) - self.activation = activation + self.activation = 
activation_layer(inplace=True) def forward(self, x: Tensor) -> Tensor: if self.proj_block: @@ -213,42 +176,35 @@ def __init__( width_in: int, width_out: int, stride: int, - bn_epsilon: float, - bn_momentum: float, - activation: nn.Module, + norm_layer: Callable[..., nn.Module], + activation_layer: Callable[..., nn.Module], group_width: int, bottleneck_multiplier: float, se_ratio: Optional[float], ) -> None: - super().__init__() + layers = OrderedDict() w_b = int(round(width_out * bottleneck_multiplier)) g = w_b // group_width - self.a = nn.Sequential( - nn.Conv2d(width_in, w_b, 1, stride=1, padding=0, bias=False), - nn.BatchNorm2d(w_b, eps=bn_epsilon, momentum=bn_momentum), - activation, - ) - - self.b = nn.Sequential( - nn.Conv2d(w_b, w_b, 3, stride=stride, padding=1, groups=g, bias=False), - nn.BatchNorm2d(w_b, eps=bn_epsilon, momentum=bn_momentum), - activation, - ) + layers["a"] = ConvBNActivation(width_in, w_b, kernel_size=1, stride=1, + norm_layer=norm_layer, activation_layer=activation_layer) + layers["b"] = ConvBNActivation(w_b, w_b, kernel_size=3, stride=stride, groups=g, + norm_layer=norm_layer, activation_layer=activation_layer) if se_ratio: # The SE reduction ratio is defined with respect to the # beginning of the block width_se_out = int(round(se_ratio * width_in)) - self.se = _SqueezeExcitation( + layers["se"] = _SqueezeExcitation( in_channels=w_b, reduction_ratio=None, reduced_channels=width_se_out, - activation=activation, + activation=activation_layer(inplace=True), ) - self.c = nn.Conv2d(w_b, width_out, 1, stride=1, padding=0, bias=False) - self.final_bn = nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum) + layers["c"] = nn.Conv2d(w_b, width_out, 1, stride=1, padding=0, bias=False) + layers["final_bn"] = norm_layer(width_out) + super().__init__(layers) class ResBottleneckBlock(nn.Module): @@ -259,9 +215,8 @@ def __init__( width_in: int, width_out: int, stride: int, - bn_epsilon: float, - bn_momentum: float, - activation: nn.Module, + norm_layer: Callable[..., nn.Module], + activation_layer: Callable[..., nn.Module], group_width: int = 1, bottleneck_multiplier: float = 1.0, se_ratio: Optional[float] = None, @@ -274,19 +229,18 @@ def __init__( self.proj = nn.Conv2d( width_in, width_out, 1, stride=stride, padding=0, bias=False ) - self.bn = nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum) + self.bn = norm_layer(width_out) self.f = BottleneckTransform( width_in, width_out, stride, - bn_epsilon, - bn_momentum, - activation, + norm_layer, + activation_layer, group_width, bottleneck_multiplier, se_ratio, ) - self.activation = activation + self.activation = activation_layer(inplace=True) # The projection and transform happen in parallel, # and activation is not counted with respect to depth @@ -307,9 +261,8 @@ def __init__( width_in: int, width_out: int, stride: int, - bn_epsilon: float, - bn_momentum: float, - activation: nn.Module, + norm_layer: Callable[..., nn.Module], + activation_layer: Callable[..., nn.Module], group_width: int = 1, bottleneck_multiplier: float = 4.0, se_ratio: Optional[float] = None, @@ -320,9 +273,8 @@ def __init__( width_in, width_out, stride, - bn_epsilon, - bn_momentum, - activation, + norm_layer, + activation_layer, group_width, bottleneck_multiplier, se_ratio, @@ -342,9 +294,8 @@ def __init__( stride: int, depth: int, block_constructor: Callable[..., nn.Module], - bn_epsilon: float, - bn_momentum: float, - activation: nn.Module, + norm_layer: Callable[..., nn.Module], + activation_layer: Callable[..., nn.Module], 
group_width: int, bottleneck_multiplier: float, se_ratio: Optional[float] = None, @@ -357,9 +308,8 @@ def __init__( width_in if i == 0 else width_out, width_out, stride if i == 0 else 1, - bn_epsilon, - bn_momentum, - activation, + norm_layer, + activation_layer, group_width, bottleneck_multiplier, se_ratio, @@ -398,11 +348,10 @@ def __init__( self.stem_type = stem_type self.block_type = block_type self.activation = activation + self.norm_layer = partial(nn.BatchNorm2d, eps=bn_epsilon, momentum=bn_momentum) self.stem_width = stem_width self.use_se = use_se self.se_ratio = se_ratio if use_se else None - self.bn_epsilon = bn_epsilon - self.bn_momentum = bn_momentum self.num_classes = num_classes def get_expanded_params(self): @@ -486,15 +435,12 @@ class RegNet(nn.Module): def __init__(self, params: RegNetParams) -> None: super().__init__() - activation = params.activation(inplace=True) - # Ad hoc stem self.stem = params.stem_type( 3, # width_in params.stem_width, - params.bn_epsilon, - params.bn_momentum, - activation, + params.norm_layer, + params.activation, ) current_width = params.stem_width @@ -516,9 +462,8 @@ def __init__(self, params: RegNetParams) -> None: stride, depth, params.block_type, - params.bn_epsilon, - params.bn_momentum, - activation, + params.norm_layer, + params.activation, group_width, bottleneck_multiplier, params.se_ratio, From 346aba7007a463cbe9a958472be68f7ace35c76c Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Fri, 17 Sep 2021 07:18:51 +0000 Subject: [PATCH 13/40] add expected test pkl files --- .../ModelTester.test_regnet_x_16gf_expect.pkl | Bin 0 -> 939 bytes .../ModelTester.test_regnet_x_1_6gf_expect.pkl | Bin 0 -> 939 bytes .../ModelTester.test_regnet_x_32gf_expect.pkl | Bin 0 -> 939 bytes .../ModelTester.test_regnet_x_3_2gf_expect.pkl | Bin 0 -> 939 bytes .../ModelTester.test_regnet_x_400mf_expect.pkl | Bin 0 -> 939 bytes .../ModelTester.test_regnet_x_800mf_expect.pkl | Bin 0 -> 939 bytes .../ModelTester.test_regnet_x_8gf_expect.pkl | Bin 0 -> 939 bytes .../ModelTester.test_regnet_y_16gf_expect.pkl | Bin 0 -> 939 bytes .../ModelTester.test_regnet_y_1_6gf_expect.pkl | Bin 0 -> 939 bytes .../ModelTester.test_regnet_y_32gf_expect.pkl | Bin 0 -> 939 bytes .../ModelTester.test_regnet_y_3_2gf_expect.pkl | Bin 0 -> 939 bytes .../ModelTester.test_regnet_y_400mf_expect.pkl | Bin 0 -> 939 bytes .../ModelTester.test_regnet_y_800mf_expect.pkl | Bin 0 -> 939 bytes .../ModelTester.test_regnet_y_8gf_expect.pkl | Bin 0 -> 939 bytes 14 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 test/expect/ModelTester.test_regnet_x_16gf_expect.pkl create mode 100644 test/expect/ModelTester.test_regnet_x_1_6gf_expect.pkl create mode 100644 test/expect/ModelTester.test_regnet_x_32gf_expect.pkl create mode 100644 test/expect/ModelTester.test_regnet_x_3_2gf_expect.pkl create mode 100644 test/expect/ModelTester.test_regnet_x_400mf_expect.pkl create mode 100644 test/expect/ModelTester.test_regnet_x_800mf_expect.pkl create mode 100644 test/expect/ModelTester.test_regnet_x_8gf_expect.pkl create mode 100644 test/expect/ModelTester.test_regnet_y_16gf_expect.pkl create mode 100644 test/expect/ModelTester.test_regnet_y_1_6gf_expect.pkl create mode 100644 test/expect/ModelTester.test_regnet_y_32gf_expect.pkl create mode 100644 test/expect/ModelTester.test_regnet_y_3_2gf_expect.pkl create mode 100644 test/expect/ModelTester.test_regnet_y_400mf_expect.pkl create mode 100644 test/expect/ModelTester.test_regnet_y_800mf_expect.pkl create mode 100644 
test/expect/ModelTester.test_regnet_y_8gf_expect.pkl
[GIT binary patches: 939-byte literal payloads for the new test/expect/ModelTester.test_regnet_*_expect.pkl files listed above]
zE6qkQV6Dx(jaoKh_jC5VEttGpLnePuO>%{e;x?UKJNH!X5%=b|Q5Idl+wiA{O&W{$ zt`C{=dva!Mw$f){-m_n#%SNw$?(R>G{nj~#x2;_zC)rf1+U=Qf>y4F(VB&6uqa1r4 zt9n^&vy|Ou> literal 0 HcmV?d00001 diff --git a/test/expect/ModelTester.test_regnet_y_800mf_expect.pkl b/test/expect/ModelTester.test_regnet_y_800mf_expect.pkl new file mode 100644 index 0000000000000000000000000000000000000000..a27eefdfd299fc38dad0b600cdb2d35effa3d9e8 GIT binary patch literal 939 zcmWIWW@cev;NW1u00Im`42ea_8JT6N`YDMeiFyUuIc`pT3{fbcfhoBpAE-(%zO*DW zr zf)S|3ppZF&8AvA=loqmh8ZvUemW=jY_4CYNO9=M{7L z7p0^YrKY%KCYNv(a%ct>a+VZw1r>7Z1$eV_Fj*X^nFTZrgadH;l#f9R#i#lPZcb`w z{zUOK5@-3x&fUJUWj51hGh5yH|7W+pg!Qg7sm{BF_FUh+-&=0C>#VljLLC}5t3-u& zMXLYVDZM{=Px^{UR%YB5yXNxvSfA6lwCk{(=k67!r|v$P%DHDf8;8yAzXiMB%1Ya0 zzA)U~66b5V-86i+k$|MNkxucRZyKw2Ep6@F9n{>gQ*^~FYeRj<-N%+I*@T5Jw0gDm zx7B6)p50H^dbb;FmU`T;5?$F|Au!e_LS!z)+Fc#dL%!v#xq>zI!jk!QJU!0d7 z$^^6(gaf=8K@>bqBFCWsNCE|*r%-g=$bRBO(fJC4k zf)S|3ppZF&8AvA=loqmh8ZvUemW=jY_4CYNO9=M{7L z7p0^YrKY%KCYNv(a%ct>a+VZw1r>7Z1$eV_Fj*X^nFTZrgadH;l#f9R#i#lPZcb`w z{zUOK5~sfBmi1;o<2}FDKDL>$%FgESR)O8Y-flLLHJ7YsWPGxTThXv5F7A|d{EIm@ z*ACm*#4ozJNBq~~J-nsHHd2YPHg6vDSzp(X+LJMleNUrM*`C{ndaavIM(ok&yl-Rr zx5GwfO`{Fx%g#L>(;KZfH+9;ae#vc}^$C@O@8`$Q+xc5&U}= zFKOOWo+@So3a!~{Zw*cWLkfg(hZaABH9WM+Qj3a#vEb%pPGoQ)g&c%w%muRf;=J@w zCZMe#9N^6eqTp!~ISvIt5-0#Yg`(?5_7fk9&R0MlvTl6?biK%~;z!Xd0dyfuFEk7S zyxG`vpo-*}b>YfEi5Uc7^mYiBVG`JrAn&q)@& Date: Sun, 19 Sep 2021 22:59:51 +0000 Subject: [PATCH 14/40] allow custom activation in SqueezeExcitation --- torchvision/models/efficientnet.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/torchvision/models/efficientnet.py b/torchvision/models/efficientnet.py index bad5b57b25b..a9f8ac5a7c0 100644 --- a/torchvision/models/efficientnet.py +++ b/torchvision/models/efficientnet.py @@ -36,14 +36,14 @@ def __init__( self, input_channels: int, squeeze_channels: int, - activation: Callable[..., nn.Module] = nn.ReLU, - scale_activation: Callable[..., nn.Module] = nn.Sigmoid, + activation: Optional[Callable[..., nn.Module]] = None, ) -> None: super().__init__() self.fc1 = nn.Conv2d(input_channels, squeeze_channels, 1) self.fc2 = nn.Conv2d(squeeze_channels, input_channels, 1) - self.activation = activation() - self.scale_activation = scale_activation() + if activation is None: + activation = nn.SiLU + self.activation = activation(inplace=True) def _scale(self, input: Tensor) -> Tensor: scale = F.adaptive_avg_pool2d(input, 1) From e4863079d9ca542e585bcc16eebfa34bb8542f47 Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Mon, 20 Sep 2021 20:04:51 +0000 Subject: [PATCH 15/40] use ReLU as the default activation --- torchvision/models/efficientnet.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/torchvision/models/efficientnet.py b/torchvision/models/efficientnet.py index a9f8ac5a7c0..dbfb6bb7dd7 100644 --- a/torchvision/models/efficientnet.py +++ b/torchvision/models/efficientnet.py @@ -36,14 +36,12 @@ def __init__( self, input_channels: int, squeeze_channels: int, - activation: Optional[Callable[..., nn.Module]] = None, + activation: Callable[..., nn.Module] = nn.ReLU, ) -> None: super().__init__() self.fc1 = nn.Conv2d(input_channels, squeeze_channels, 1) self.fc2 = nn.Conv2d(squeeze_channels, input_channels, 1) - if activation is None: - activation = nn.SiLU - self.activation = activation(inplace=True) + self.activation = activation() def _scale(self, input: Tensor) -> Tensor: scale = F.adaptive_avg_pool2d(input, 1) From 8cab2bbd70484319a50a631f108dc6fd410c54a8 Mon Sep 17 00:00:00 2001 
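Patches 14 and 15 above settle the SqueezeExcitation constructor in torchvision/models/efficientnet.py on (input_channels, squeeze_channels, activation=nn.ReLU), with the activation passed as a class and instantiated inside the block. A minimal usage sketch of that interface as it stands at this point in the series (the forward pass is not shown in the diff; the usual squeeze-and-excite behaviour of rescaling the input channel-wise is assumed, and the channel sizes below are arbitrary example values):

    import torch
    from torch import nn
    from torchvision.models.efficientnet import SqueezeExcitation

    # 64-channel feature map squeezed down to 16 channels inside the block;
    # the inner activation is configurable and defaults to nn.ReLU.
    se = SqueezeExcitation(input_channels=64, squeeze_channels=16, activation=nn.ReLU)
    x = torch.randn(2, 64, 7, 7)
    out = se(x)  # channel-wise rescaling, so the output shape should remain (2, 64, 7, 7)
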
From: Kai Zhang Date: Thu, 9 Sep 2021 00:15:55 +0000 Subject: [PATCH 16/40] initial code --- torchvision/models/regnet.py | 707 +---------------------------------- 1 file changed, 6 insertions(+), 701 deletions(-) diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py index 9c3330ec726..96493ae3c4c 100644 --- a/torchvision/models/regnet.py +++ b/torchvision/models/regnet.py @@ -1,708 +1,13 @@ -# Modified from -# https://github.com/facebookresearch/ClassyVision/blob/main/classy_vision/models/anynet.py -# https://github.com/facebookresearch/ClassyVision/blob/main/classy_vision/models/regnet.py - - -import numpy as np -import math -import torch - -from collections import OrderedDict -from functools import partial -from typing import Any, Callable, List, Optional, Tuple -from torch import nn, Tensor - -from .._internally_replaced_utils import load_state_dict_from_url -from torchvision.models.mobilenetv2 import ConvBNActivation, _make_divisible - -model_urls = { - # TODO(kazhang): add pretrained weights - "regnet_y_400m": "", -} - - -class _SqueezeExcitation(nn.Module): - """ - Squeeze and excitation layer from - `"Squeeze-and-Excitation Networks" `_. - """ - - def __init__( - self, - in_channels: int, - reduction_ratio: Optional[int] = 16, - reduced_channels: Optional[int] = None, - activation: Optional[nn.Module] = None, - ) -> None: - super().__init__() - self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) - - # Either reduction_ratio is defined, or out_channels is defined, - # neither both nor none of them - assert bool(reduction_ratio) != bool(reduced_channels) - - if activation is None: - activation = nn.ReLU() - - reduced_channels = ( - in_channels // reduction_ratio if reduced_channels is None else reduced_channels - ) - self.excitation = nn.Sequential( - nn.Conv2d(in_channels, reduced_channels, kernel_size=1, stride=1, bias=True), - activation, - nn.Conv2d(reduced_channels, in_channels, kernel_size=1, stride=1, bias=True), - nn.Sigmoid(), - ) - - def forward(self, x: Tensor) -> Tensor: - x_squeezed = self.avgpool(x) - x_excited = self.excitation(x_squeezed) - x_scaled = x * x_excited - return x_scaled - - -class BasicTransform(nn.Sequential): - """Basic transformation: [3x3 conv, BN, Relu] x2.""" - - def __init__( - self, - width_in: int, - width_out: int, - stride: int, - norm_layer: Callable[..., nn.Module], - activation_layer: Callable[..., nn.Module], - ) -> None: - super().__init__(OrderedDict( - a=nn.Sequential( - ConvBNActivation(width_in, width_out, kernel_size=3, stride=stride, - norm_layer=norm_layer, activation_layer=activation_layer), - nn.Conv2d(width_out, width_out, 3, stride=1, padding=1, bias=False), - ), - final_bn=norm_layer(width_out), - )) - - -class ResStemIN(nn.Sequential): - """ResNet stem for ImageNet: 7x7, BN, ReLU, MaxPool.""" - - def __init__( - self, - width_in: int, - width_out: int, - norm_layer: Callable[..., nn.Module], - activation_layer: Callable[..., nn.Module], - ) -> None: - super().__init__( - ConvBNActivation(width_in, width_out, kernel_size=7, stride=2, - norm_layer=norm_layer, activation_layer=activation_layer), - nn.MaxPool2d(3, stride=2, padding=1), - ) - - -class SimpleStemIN(ConvBNActivation): - """Simple stem for ImageNet: 3x3, BN, ReLU.""" - - def __init__( - self, - width_in: int, - width_out: int, - norm_layer: Callable[..., nn.Module], - activation_layer: Callable[..., nn.Module], - ) -> None: - super().__init__(width_in, width_out, kernel_size=3, stride=2, - norm_layer=norm_layer, 
activation_layer=activation_layer) - - -class VanillaBlock(nn.Sequential): - """Vanilla block: [3x3 conv, BN, Relu] x2.""" - - def __init__( - self, - width_in: int, - width_out: int, - stride: int, - norm_layer: Callable[..., nn.Module], - activation_layer: Callable[..., nn.Module], - *args, - **kwargs, - ) -> None: - super().__init__(OrderedDict( - a=ConvBNActivation(width_in, width_out, kernel_size=3, stride=stride, - norm_layer=norm_layer, activation_layer=activation_layer), - b=ConvBNActivation(width_out, width_out, kernel_size=3, stride=1, - norm_layer=norm_layer, activation_layer=activation_layer), - )) - - -class ResBasicBlock(nn.Module): - """Residual basic block: x + F(x), F = basic transform.""" - - def __init__( - self, - width_in: int, - width_out: int, - stride: int, - norm_layer: Callable[..., nn.Module], - activation_layer: Callable[..., nn.Module], - *args, - **kwargs, - ) -> None: - super().__init__() - self.proj_block = (width_in != width_out) or (stride != 1) - if self.proj_block: - self.proj = nn.Conv2d( - width_in, width_out, 1, stride=stride, padding=0, bias=False - ) - self.bn = norm_layer(width_out) - self.f = BasicTransform( - width_in, width_out, stride, norm_layer, activation_layer - ) - self.activation = activation_layer(inplace=True) - - def forward(self, x: Tensor) -> Tensor: - if self.proj_block: - x = self.bn(self.proj(x)) + self.f(x) - else: - x = x + self.f(x) - - return self.activation(x) - - -class BottleneckTransform(nn.Sequential): - """Bottleneck transformation: 1x1, 3x3 [+SE], 1x1.""" - - def __init__( - self, - width_in: int, - width_out: int, - stride: int, - norm_layer: Callable[..., nn.Module], - activation_layer: Callable[..., nn.Module], - group_width: int, - bottleneck_multiplier: float, - se_ratio: Optional[float], - ) -> None: - layers = OrderedDict() - w_b = int(round(width_out * bottleneck_multiplier)) - g = w_b // group_width - - layers["a"] = ConvBNActivation(width_in, w_b, kernel_size=1, stride=1, - norm_layer=norm_layer, activation_layer=activation_layer) - layers["b"] = ConvBNActivation(w_b, w_b, kernel_size=3, stride=stride, groups=g, - norm_layer=norm_layer, activation_layer=activation_layer) - - if se_ratio: - # The SE reduction ratio is defined with respect to the - # beginning of the block - width_se_out = int(round(se_ratio * width_in)) - layers["se"] = _SqueezeExcitation( - in_channels=w_b, - reduction_ratio=None, - reduced_channels=width_se_out, - activation=activation_layer(inplace=True), - ) - - layers["c"] = nn.Conv2d(w_b, width_out, 1, stride=1, padding=0, bias=False) - layers["final_bn"] = norm_layer(width_out) - super().__init__(layers) - - -class ResBottleneckBlock(nn.Module): - """Residual bottleneck block: x + F(x), F = bottleneck transform.""" - - def __init__( - self, - width_in: int, - width_out: int, - stride: int, - norm_layer: Callable[..., nn.Module], - activation_layer: Callable[..., nn.Module], - group_width: int = 1, - bottleneck_multiplier: float = 1.0, - se_ratio: Optional[float] = None, - ) -> None: - super().__init__() - - # Use skip connection with projection if shape changes - self.proj_block = (width_in != width_out) or (stride != 1) - if self.proj_block: - self.proj = nn.Conv2d( - width_in, width_out, 1, stride=stride, padding=0, bias=False - ) - self.bn = norm_layer(width_out) - self.f = BottleneckTransform( - width_in, - width_out, - stride, - norm_layer, - activation_layer, - group_width, - bottleneck_multiplier, - se_ratio, - ) - self.activation = activation_layer(inplace=True) - - # The 
projection and transform happen in parallel, - # and activation is not counted with respect to depth - - def forward(self, x: Tensor) -> Tensor: - if self.proj_block: - x = self.bn(self.proj(x)) + self.f(x) - else: - x = x + self.f(x) - return self.activation(x) - - -class ResBottleneckLinearBlock(nn.Module): - """Residual linear bottleneck block: x + F(x), F = bottleneck transform.""" - - def __init__( - self, - width_in: int, - width_out: int, - stride: int, - norm_layer: Callable[..., nn.Module], - activation_layer: Callable[..., nn.Module], - group_width: int = 1, - bottleneck_multiplier: float = 4.0, - se_ratio: Optional[float] = None, - ) -> None: - super().__init__() - self.has_skip = (width_in == width_out) and (stride == 1) - self.f = BottleneckTransform( - width_in, - width_out, - stride, - norm_layer, - activation_layer, - group_width, - bottleneck_multiplier, - se_ratio, - ) - - def forward(self, x: Tensor) -> Tensor: - return x + self.f(x) if self.has_skip else self.f(x) - - -class AnyStage(nn.Sequential): - """AnyNet stage (sequence of blocks w/ the same output shape).""" - - def __init__( - self, - width_in: int, - width_out: int, - stride: int, - depth: int, - block_constructor: Callable[..., nn.Module], - norm_layer: Callable[..., nn.Module], - activation_layer: Callable[..., nn.Module], - group_width: int, - bottleneck_multiplier: float, - se_ratio: Optional[float] = None, - stage_index: int = 0, - ) -> None: - super().__init__() - - for i in range(depth): - block = block_constructor( - width_in if i == 0 else width_out, - width_out, - stride if i == 0 else 1, - norm_layer, - activation_layer, - group_width, - bottleneck_multiplier, - se_ratio, - ) - - self.add_module(f"block{stage_index}-{i}", block) - +from torch import nn class RegNetParams: - def __init__( - self, - depth: int, - w_0: int, - w_a: float, - w_m: float, - group_width: int, - bottleneck_multiplier: float = 1.0, - stem_type: Callable[..., nn.Module] = SimpleStemIN, - stem_width: int = 32, - block_type: Callable[..., nn.Module] = ResBottleneckBlock, - activation: Callable[..., nn.Module] = nn.ReLU, - use_se: bool = True, - se_ratio: float = 0.25, - bn_epsilon: float = 1e-05, - bn_momentum: float = 0.1, - num_classes: int = 1000, - ) -> None: - if w_a < 0 or w_0 <= 0 or w_m <= 1 or w_0 % 8 != 0: - raise ValueError("Invalid RegNet settings") - self.depth = depth - self.w_0 = w_0 - self.w_a = w_a - self.w_m = w_m - self.group_width = group_width - self.bottleneck_multiplier = bottleneck_multiplier - self.stem_type = stem_type - self.block_type = block_type - self.activation = activation - self.norm_layer = partial(nn.BatchNorm2d, eps=bn_epsilon, momentum=bn_momentum) - self.stem_width = stem_width - self.use_se = use_se - self.se_ratio = se_ratio if use_se else None - self.num_classes = num_classes - - def get_expanded_params(self): - """ - Programatically compute all the per-block settings, - given the RegNet parameters. - - The first step is to compute the quantized linear block parameters, - in log space. Key parameters are: - - `w_a` is the width progression slope - - `w_0` is the initial width - - `w_m` is the width stepping in the log space - - In other terms - `log(block_width) = log(w_0) + w_m * block_capacity`, - with `bock_capacity` ramping up following the w_0 and w_a params. - This block width is finally quantized to multiples of 8. - - The second step is to compute the parameters per stage, - taking into account the skip connection and the final 1x1 convolutions. 
- We use the fact that the output width is constant within a stage. - """ - - QUANT = 8 - STRIDE = 2 - - # Compute the block widths. Each stage has one unique block width - widths_cont = np.arange(self.depth) * self.w_a + self.w_0 - block_capacity = np.round(np.log(widths_cont / self.w_0) / np.log(self.w_m)) - block_widths = ( - np.round(np.divide(self.w_0 * np.power(self.w_m, block_capacity), QUANT)) - * QUANT - ) - num_stages = len(np.unique(block_widths)) - block_widths = block_widths.astype(int).tolist() - - # Convert to per stage parameters - split_helper = zip( - block_widths + [0], - [0] + block_widths, - block_widths + [0], - [0] + block_widths, - ) - splits = [w != wp or r != rp for w, wp, r, rp in split_helper] - - stage_widths = [w for w, t in zip(block_widths, splits[:-1]) if t] - stage_depths = np.diff([d for d, t in enumerate(splits) if t]).tolist() - - strides = [STRIDE] * num_stages - bottleneck_multipliers = [self.bottleneck_multiplier] * num_stages - group_widths = [self.group_width] * num_stages - - # Adjust the compatibility of stage widths and group widths - stage_widths, group_widths = self._adjust_widths_groups_compatibilty( - stage_widths, bottleneck_multipliers, group_widths - ) - - return zip( - stage_widths, strides, stage_depths, group_widths, bottleneck_multipliers - ) - - @staticmethod - def _adjust_widths_groups_compatibilty( - stage_widths: List[int], bottleneck_ratios: List[float], - group_widths: List[int]) -> Tuple[List[int], List[int]]: - """ - Adjusts the compatibility of widths and groups, - depending on the bottleneck ratio. - """ - # Compute all widths for the current settings - widths = [int(w * b) for w, b in zip(stage_widths, bottleneck_ratios)] - group_widths_min = [min(g, w_bot) for g, w_bot in zip(group_widths, widths)] - - # Compute the adjusted widths so that stage and group widths fit - ws_bot = [_make_divisible(w_bot, g) for w_bot, g in zip(widths, group_widths_min)] - stage_widths = [int(w_bot / b) for w_bot, b in zip(ws_bot, bottleneck_ratios)] - return stage_widths, group_widths_min + pass +class SqueezeExcitation(nn.Module): + pass class RegNet(nn.Module): - def __init__(self, params: RegNetParams) -> None: - super().__init__() - - # Ad hoc stem - self.stem = params.stem_type( - 3, # width_in - params.stem_width, - params.norm_layer, - params.activation, - ) - - current_width = params.stem_width - - blocks = [] - for i, ( - width_out, - stride, - depth, - group_width, - bottleneck_multiplier, - ) in enumerate(params.get_expanded_params()): - blocks.append( - ( - f"block{i+1}", - AnyStage( - current_width, - width_out, - stride, - depth, - params.block_type, - params.norm_layer, - params.activation, - group_width, - bottleneck_multiplier, - params.se_ratio, - stage_index=i + 1, - ), - ) - ) - - current_width = width_out - - self.trunk_output = nn.Sequential(OrderedDict(blocks)) - - self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) - self.fc = nn.Linear(in_features=current_width, out_features=params.num_classes) - - # Init weights and good to go - self._init_weights() - - def forward(self, x: Tensor) -> Tensor: - x = self.stem(x) - x = self.trunk_output(x) - - x = self.avgpool(x) - x = x.flatten(start_dim=1) - x = self.fc(x) - - return x - - def _init_weights(self) -> None: - # Performs ResNet-style weight initialization - for m in self.modules(): - if isinstance(m, nn.Conv2d): - # Note that there is no bias due to BN - fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels - m.weight.data.normal_(mean=0.0, std=math.sqrt(2.0 / fan_out)) 
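# The conv initialization in the two lines above (std = sqrt(2 / fan_out), with
# fan_out = kernel_h * kernel_w * out_channels) is the ResNet-style "Kaiming"
# fan-out scheme. A rough standalone equivalent, with arbitrary example channel
# sizes that are not taken from the patch, would be:
#
#     from torch import nn
#     conv = nn.Conv2d(32, 64, kernel_size=3, padding=1, bias=False)
#     nn.init.kaiming_normal_(conv.weight, mode="fan_out", nonlinearity="relu")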
- elif isinstance(m, nn.BatchNorm2d): - m.weight.data.fill_(1.0) - m.bias.data.zero_() - elif isinstance(m, nn.Linear): - m.weight.data.normal_(mean=0.0, std=0.01) - m.bias.data.zero_() - - -def _regnet(arch: str, params: RegNetParams, pretrained: bool, progress: bool, **kwargs: Any) -> RegNet: - model = RegNet(params) - if pretrained: - if arch not in model_urls: - raise ValueError(f"No checkpoint is available for model type {arch}") - state_dict = load_state_dict_from_url(model_urls[arch], progress=progress) - model.load_state_dict(state_dict) - return model - + pass def regnet_y_400mf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: - """ - Constructs a RegNetY_400MF architecture from - `"Designing Network Design Spaces" `_. - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - progress (bool): If True, displays a progress bar of the download to stderr - """ - params = RegNetParams(depth=16, w_0=48, w_a=27.89, w_m=2.09, group_width=8, **kwargs) - return _regnet("regnet_y_400mf", params, pretrained, progress, **kwargs) - - -def regnet_y_800mf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: - """ - Constructs a RegNetY_800MF architecture from - `"Designing Network Design Spaces" `_. - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - progress (bool): If True, displays a progress bar of the download to stderr - """ - params = RegNetParams(depth=14, w_0=56, w_a=38.84, w_m=2.4, group_width=16, **kwargs) - return _regnet("regnet_y_800mf", params, pretrained, progress, **kwargs) - - -def regnet_y_1_6gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: - """ - Constructs a RegNetY_1.6GF architecture from - `"Designing Network Design Spaces" `_. - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - progress (bool): If True, displays a progress bar of the download to stderr - """ - params = RegNetParams(depth=27, w_0=48, w_a=20.71, w_m=2.65, group_width=24, **kwargs) - return _regnet("regnet_y_1_6gf", params, pretrained, progress, **kwargs) - - -def regnet_y_3_2gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: - """ - Constructs a RegNetY_3.2GF architecture from - `"Designing Network Design Spaces" `_. - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - progress (bool): If True, displays a progress bar of the download to stderr - """ - params = RegNetParams(depth=21, w_0=80, w_a=42.63, w_m=2.66, group_width=24, **kwargs) - return _regnet("regnet_y_3_2gf", params, pretrained, progress, **kwargs) - - -def regnet_y_8gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: - """ - Constructs a RegNetY_8GF architecture from - `"Designing Network Design Spaces" `_. - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - progress (bool): If True, displays a progress bar of the download to stderr - """ - params = RegNetParams(depth=17, w_0=192, w_a=76.82, w_m=2.19, group_width=56, **kwargs) - return _regnet("regnet_y_8gf", params, pretrained, progress, **kwargs) - - -def regnet_y_16gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: - """ - Constructs a RegNetY_16GF architecture from - `"Designing Network Design Spaces" `_. 
- - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - progress (bool): If True, displays a progress bar of the download to stderr - """ - params = RegNetParams(depth=18, w_0=200, w_a=106.23, w_m=2.48, group_width=112, **kwargs) - return _regnet("regnet_y_16gf", params, pretrained, progress, **kwargs) - - -def regnet_y_32gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: - """ - Constructs a RegNetY_32GF architecture from - `"Designing Network Design Spaces" `_. - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - progress (bool): If True, displays a progress bar of the download to stderr - """ - params = RegNetParams(depth=20, w_0=232, w_a=115.89, w_m=2.53, group_width=232, **kwargs) - return _regnet("regnet_y_32gf", params, pretrained, progress, **kwargs) - - -def regnet_x_400mf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: - """ - Constructs a RegNetX_400MF architecture from - `"Designing Network Design Spaces" `_. - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - progress (bool): If True, displays a progress bar of the download to stderr - """ - params = RegNetParams(depth=22, w_0=24, w_a=24.48, w_m=2.54, - group_width=16, use_se=False, **kwargs) - return _regnet("regnet_x_400mf", params, pretrained, progress, **kwargs) - - -def regnet_x_800mf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: - """ - Constructs a RegNetX_800MF architecture from - `"Designing Network Design Spaces" `_. - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - progress (bool): If True, displays a progress bar of the download to stderr - """ - params = RegNetParams(depth=16, w_0=56, w_a=35.73, w_m=2.28, group_width=16, - use_se=False, **kwargs) - return _regnet("regnet_x_800mf", params, pretrained, progress, **kwargs) - - -def regnet_x_1_6gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: - """ - Constructs a RegNetX_1.6GF architecture from - `"Designing Network Design Spaces" `_. - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - progress (bool): If True, displays a progress bar of the download to stderr - """ - params = RegNetParams(depth=18, w_0=80, w_a=34.01, w_m=2.25, group_width=24, - use_se=False, **kwargs) - return _regnet("regnet_x_1_6gf", params, pretrained, progress, **kwargs) - - -def regnet_x_3_2gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: - """ - Constructs a RegNetX_3.2GF architecture from - `"Designing Network Design Spaces" `_. - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - progress (bool): If True, displays a progress bar of the download to stderr - """ - params = RegNetParams(depth=25, w_0=88, w_a=26.31, w_m=2.25, group_width=48, - use_se=False, **kwargs) - return _regnet("regnet_x_3_2gf", params, pretrained, progress, **kwargs) - - -def regnet_x_8gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: - """ - Constructs a RegNetX_8GF architecture from - `"Designing Network Design Spaces" `_. 
- - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - progress (bool): If True, displays a progress bar of the download to stderr - """ - params = RegNetParams(depth=23, w_0=80, w_a=49.56, w_m=2.88, group_width=120, - use_se=False, **kwargs) - return _regnet("regnet_x_8gf", params, pretrained, progress, **kwargs) - - -def regnet_x_16gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: - """ - Constructs a RegNetX_16GF architecture from - `"Designing Network Design Spaces" `_. - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - progress (bool): If True, displays a progress bar of the download to stderr - """ - params = RegNetParams(depth=22, w_0=216, w_a=55.59, w_m=2.1, group_width=128, - use_se=False, **kwargs) - return _regnet("regnet_x_16gf", params, pretrained, progress, **kwargs) - - -def regnet_x_32gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: - """ - Constructs a RegNetX_32GF architecture from - `"Designing Network Design Spaces" `_. - - Args: - pretrained (bool): If True, returns a model pre-trained on ImageNet - progress (bool): If True, displays a progress bar of the download to stderr - """ - params = RegNetParams(depth=23, w_0=320, w_a=69.86, w_m=2.0, group_width=168, - use_se=False, **kwargs) - return _regnet("regnet_x_32gf", params, pretrained, progress, **kwargs) - -# TODO(kazhang): Add RegNetZ_500MF and RegNetZ_4GF + pass From 12b9d72885e068a3a8dca7f70b4c293777f608a2 Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Thu, 9 Sep 2021 05:08:36 +0000 Subject: [PATCH 17/40] add SqueezeExcitation --- torchvision/models/regnet.py | 40 ++++++++++++++++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py index 96493ae3c4c..eff8273695a 100644 --- a/torchvision/models/regnet.py +++ b/torchvision/models/regnet.py @@ -1,10 +1,46 @@ -from torch import nn +from torch import nn, Tenspr class RegNetParams: pass class SqueezeExcitation(nn.Module): - pass + """ + Squeeze and excitation layer from + `"Squeeze-and-Excitation Networks" `_. 
+ """ + + def __init__( + self, + in_channels: int, + reduction_ratio: Optional[int] = 16, + reduced_channels: Optional[int] = None, + activation: Optional[nn.Module] = None, + ) -> None: + super().__init__() + self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) + + # Either reduction_ratio is defined, or out_channels is defined, + # neither both nor none of them + assert bool(reduction_ratio) != bool(reduced_channels) + + if activation is None: + activation = nn.ReLU() + + reduced_channels = ( + in_channels // reduction_ratio if reduced_channels is None else reduced_channels + ) + self.excitation = nn.Sequential( + nn.Conv2d(in_channels, reduced_channels, kernel_size=1, stride=1, bias=True), + activation, + nn.Conv2d(reduced_channels, in_channels, kernel_size=1, stride=1, bias=True), + nn.Sigmoid(), + ) + + def forward(self, x: Tensor) -> Tensor: + x_squeezed = self.avgpool(x) + x_excited = self.excitation(x_squeezed) + x_scaled = x * x_excited + return x_scaled class RegNet(nn.Module): pass From 89fbb2b95ed6cec1e6bb365afd123745a952ae9d Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Thu, 9 Sep 2021 00:15:55 +0000 Subject: [PATCH 18/40] initial code --- torchvision/models/regnet.py | 40 ++---------------------------------- 1 file changed, 2 insertions(+), 38 deletions(-) diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py index eff8273695a..96493ae3c4c 100644 --- a/torchvision/models/regnet.py +++ b/torchvision/models/regnet.py @@ -1,46 +1,10 @@ -from torch import nn, Tenspr +from torch import nn class RegNetParams: pass class SqueezeExcitation(nn.Module): - """ - Squeeze and excitation layer from - `"Squeeze-and-Excitation Networks" `_. - """ - - def __init__( - self, - in_channels: int, - reduction_ratio: Optional[int] = 16, - reduced_channels: Optional[int] = None, - activation: Optional[nn.Module] = None, - ) -> None: - super().__init__() - self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) - - # Either reduction_ratio is defined, or out_channels is defined, - # neither both nor none of them - assert bool(reduction_ratio) != bool(reduced_channels) - - if activation is None: - activation = nn.ReLU() - - reduced_channels = ( - in_channels // reduction_ratio if reduced_channels is None else reduced_channels - ) - self.excitation = nn.Sequential( - nn.Conv2d(in_channels, reduced_channels, kernel_size=1, stride=1, bias=True), - activation, - nn.Conv2d(reduced_channels, in_channels, kernel_size=1, stride=1, bias=True), - nn.Sigmoid(), - ) - - def forward(self, x: Tensor) -> Tensor: - x_squeezed = self.avgpool(x) - x_excited = self.excitation(x_squeezed) - x_scaled = x * x_excited - return x_scaled + pass class RegNet(nn.Module): pass From df4890387c7deca666721ccdcf881050ffa4871e Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Thu, 9 Sep 2021 05:08:36 +0000 Subject: [PATCH 19/40] add SqueezeExcitation --- torchvision/models/regnet.py | 40 ++++++++++++++++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py index 96493ae3c4c..eff8273695a 100644 --- a/torchvision/models/regnet.py +++ b/torchvision/models/regnet.py @@ -1,10 +1,46 @@ -from torch import nn +from torch import nn, Tenspr class RegNetParams: pass class SqueezeExcitation(nn.Module): - pass + """ + Squeeze and excitation layer from + `"Squeeze-and-Excitation Networks" `_. 
+ """ + + def __init__( + self, + in_channels: int, + reduction_ratio: Optional[int] = 16, + reduced_channels: Optional[int] = None, + activation: Optional[nn.Module] = None, + ) -> None: + super().__init__() + self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) + + # Either reduction_ratio is defined, or out_channels is defined, + # neither both nor none of them + assert bool(reduction_ratio) != bool(reduced_channels) + + if activation is None: + activation = nn.ReLU() + + reduced_channels = ( + in_channels // reduction_ratio if reduced_channels is None else reduced_channels + ) + self.excitation = nn.Sequential( + nn.Conv2d(in_channels, reduced_channels, kernel_size=1, stride=1, bias=True), + activation, + nn.Conv2d(reduced_channels, in_channels, kernel_size=1, stride=1, bias=True), + nn.Sigmoid(), + ) + + def forward(self, x: Tensor) -> Tensor: + x_squeezed = self.avgpool(x) + x_excited = self.excitation(x_squeezed) + x_scaled = x * x_excited + return x_scaled class RegNet(nn.Module): pass From d71014c51f9b40e062a987cffc80d15e80ec39b4 Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Fri, 10 Sep 2021 23:21:17 +0000 Subject: [PATCH 20/40] add SqueezeExcitation --- torchvision/models/regnet.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py index eff8273695a..aa41e3096c2 100644 --- a/torchvision/models/regnet.py +++ b/torchvision/models/regnet.py @@ -1,11 +1,18 @@ from torch import nn, Tenspr +from torchvision.models.mobilenetv2 import _make_divisible + + +model_urls = { +} + class RegNetParams: pass + class SqueezeExcitation(nn.Module): """ - Squeeze and excitation layer from + Squeeze and excitation layer from `"Squeeze-and-Excitation Networks" `_. """ @@ -42,8 +49,20 @@ def forward(self, x: Tensor) -> Tensor: x_scaled = x * x_excited return x_scaled + class RegNet(nn.Module): pass + +def _regnet(arch: str, pretrained: bool, progress: bool, **kwargs: Any) -> RegNet: + model = RegNet() + if pretrained: + if arch not in model_urls: + raise ValueError(f"No checkpoint is available for model type {arch}") + state_dict = load_state_dict_from_url(model_urls[arch], progress=progress) + model.load_state_dict(state_dict) + return model + + def regnet_y_400mf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: - pass + return _regnet("regnet_y_400mf", pretrained, progress, **kwargs) From b440ae4828642a72b879590229198fcf3a3de0cf Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Tue, 14 Sep 2021 05:57:30 +0000 Subject: [PATCH 21/40] regnet blocks, stems and model definition --- torchvision/models/regnet.py | 563 ++++++++++++++++++++++++++++++++++- 1 file changed, 556 insertions(+), 7 deletions(-) diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py index aa41e3096c2..eb15da6e0ad 100644 --- a/torchvision/models/regnet.py +++ b/torchvision/models/regnet.py @@ -1,4 +1,11 @@ -from torch import nn, Tenspr +import numpy as np +import math +import torch + +from collections import OrderedDict +from enum import Enum, auto +from typing import Any, List, Optional +from torch import nn, Tensor from torchvision.models.mobilenetv2 import _make_divisible @@ -6,11 +13,140 @@ } +# The different possible blocks +class BlockType(Enum): + VANILLA_BLOCK = auto() + RES_BASIC_BLOCK = auto() + RES_BOTTLENECK_BLOCK = auto() + RES_BOTTLENECK_LINEAR_BLOCK = auto() + + +# The different possible Stems +class StemType(Enum): + RES_STEM_CIFAR = auto() + RES_STEM_IN = auto() + SIMPLE_STEM_IN = auto() 
+ + +# The different possible activations +class ActivationType(Enum): + RELU = auto() + SILU = auto() + + class RegNetParams: - pass + def __init__( + self, + depth: int, + w_0: int, + w_a: float, + w_m: float, + group_width: int, + bottleneck_multiplier: float = 1.0, + stem_type: StemType = StemType.SIMPLE_STEM_IN, + stem_width: int = 32, + block_type: BlockType = BlockType.RES_BOTTLENECK_BLOCK, + activation: ActivationType = ActivationType.RELU, + use_se: bool = True, + se_ratio: float = 0.25, + bn_epsilon: float = 1e-05, + bn_momentum: bool = 0.1, + ) -> None: + if w_a < 0 or w_0 <= 0 or w_m <= 1 or w_0 % 8 != 0: + raise ValueError("Invalid RegNet settings") + self.depth = depth + self.w_0 = w_0 + self.w_a = w_a + self.w_m = w_m + self.group_width = group_width + self.bottleneck_multiplier = bottleneck_multiplier + self.stem_type = stem_type + self.block_type = block_type + self.activation = activation + self.stem_width = stem_width + self.use_se = use_se + self.se_ratio = se_ratio if use_se else None + self.bn_epsilon = bn_epsilon + self.bn_momentum = bn_momentum + + def get_expanded_params(self): + """ + Programatically compute all the per-block settings, + given the RegNet parameters. + + The first step is to compute the quantized linear block parameters, + in log space. Key parameters are: + - `w_a` is the width progression slope + - `w_0` is the initial width + - `w_m` is the width stepping in the log space + + In other terms + `log(block_width) = log(w_0) + w_m * block_capacity`, + with `bock_capacity` ramping up following the w_0 and w_a params. + This block width is finally quantized to multiples of 8. + + The second step is to compute the parameters per stage, + taking into account the skip connection and the final 1x1 convolutions. + We use the fact that the output width is constant within a stage. + """ + + QUANT = 8 + STRIDE = 2 + + # Compute the block widths. Each stage has one unique block width + widths_cont = np.arange(self.depth) * self.w_a + self.w_0 + block_capacity = np.round(np.log(widths_cont / self.w_0) / np.log(self.w_m)) + block_widths = ( + np.round(np.divide(self.w_0 * np.power(self.w_m, block_capacity), QUANT)) + * QUANT + ) + num_stages = len(np.unique(block_widths)) + block_widths = block_widths.astype(int).tolist() + + # Convert to per stage parameters + split_helper = zip( + block_widths + [0], + [0] + block_widths, + block_widths + [0], + [0] + block_widths, + ) + splits = [w != wp or r != rp for w, wp, r, rp in split_helper] + + stage_widths = [w for w, t in zip(block_widths, splits[:-1]) if t] + stage_depths = np.diff([d for d, t in enumerate(splits) if t]).tolist() + strides = [STRIDE] * num_stages + bottleneck_multipliers = [self.bottleneck_multiplier] * num_stages + group_widths = [self.group_width] * num_stages -class SqueezeExcitation(nn.Module): + # Adjust the compatibility of stage widths and group widths + stage_widths, group_widths = self._adjust_widths_groups_compatibilty( + stage_widths, bottleneck_multipliers, group_widths + ) + + return zip( + stage_widths, strides, stage_depths, group_widths, bottleneck_multipliers + ) + + @staticmethod + def _adjust_widths_groups_compatibilty( + stage_widths: List[int], bottleneck_ratios: List[float], + group_widths: List[int]) -> Tuple(List[int], List[int]): + """ + Adjusts the compatibility of widths and groups, + depending on the bottleneck ratio. 
+ """ + # Compute all widths for the current settings + widths = [int(w * b) for w, b in zip(stage_widths, bottleneck_ratios)] + group_widths_min = [min(g, w_bot) for g, w_bot in zip(group_widths, widths)] + + # Compute the adjusted widths so that stage and group widths fit + ws_bot = [_make_divisible(w_bot, g) for w_bot, g in zip(widths, group_widths_min)] + stage_widths = [int(w_bot / b) for w_bot, b in zip(ws_bot, bottleneck_ratios)] + return stage_widths, group_widths_min + + +class _SqueezeExcitation(nn.Module): """ Squeeze and excitation layer from `"Squeeze-and-Excitation Networks" `_. @@ -50,12 +186,424 @@ def forward(self, x: Tensor) -> Tensor: return x_scaled +class BasicTransform(nn.Sequential): + """Basic transformation: [3x3 conv, BN, Relu] x2.""" + + def __init__( + self, + width_in: int, + width_out: int, + stride: int, + bn_epsilon: float, + bn_momentum: float, + activation: nn.Module, + ) -> None: + super().__init__() + + self.a = nn.Sequential( + nn.Conv2d(width_in, width_out, 3, stride=stride, padding=1, bias=False), + nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum), + activation, + nn.Conv2d(width_out, width_out, 3, stride=1, padding=1, bias=False), + ) + + self.final_bn = nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum) + self.depth = 2 + + +class ResStemCifar(nn.Sequential): + """ResNet stem for CIFAR: 3x3, BN, ReLU.""" + + def __init__( + self, + width_in: int, + width_out: int, + bn_epsilon: float, + bn_momentum: float, + activation: nn.Module, + ) -> None: + super().__init__() + self.stem = nn.Sequential( + nn.Conv2d(width_in, width_out, 3, stride=1, padding=1, bias=False), + nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum), + activation, + ) + self.depth = 2 + + +class ResStemIN(nn.Sequential): + """ResNet stem for ImageNet: 7x7, BN, ReLU, MaxPool.""" + + def __init__( + self, + width_in: int, + width_out: int, + bn_epsilon: float, + bn_momentum: float, + activation: nn.Module, + ) -> None: + super().__init__() + self.stem = nn.Sequential( + nn.Conv2d(width_in, width_out, 7, stride=2, padding=3, bias=False), + nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum), + activation, + nn.MaxPool2d(3, stride=2, padding=1), + ) + self.depth = 3 + + +class SimpleStemIN(nn.Sequential): + """Simple stem for ImageNet: 3x3, BN, ReLU.""" + + def __init__( + self, + width_in: int, + width_out: int, + bn_epsilon: float, + bn_momentum: float, + activation: nn.Module, + ) -> None: + super().__init__() + self.stem = nn.Sequential( + nn.Conv2d(width_in, width_out, 3, stride=2, padding=1, bias=False), + nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum), + activation, + ) + self.depth = 2 + + +class VanillaBlock(nn.Sequential): + """Vanilla block: [3x3 conv, BN, Relu] x2.""" + + def __init__( + self, + width_in: int, + width_out: int, + stride: int, + bn_epsilon: float, + bn_momentum: float, + activation: nn.Module, + *args, + **kwargs, + ) -> None: + super().__init__() + self.a = nn.Sequential( + nn.Conv2d(width_in, width_out, 3, stride=stride, padding=1, bias=False), + nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum), + activation, + ) + + self.b = nn.Sequential( + nn.Conv2d(width_out, width_out, 3, stride=1, padding=1, bias=False), + nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum), + activation, + ) + + self.depth = 2 + + +class ResBasicBlock(nn.Module): + """Residual basic block: x + F(x), F = basic transform.""" + + def __init__( + self, + width_in: int, + width_out: int, + 
stride: int, + bn_epsilon: float, + bn_momentum: float, + activation: nn.Module, + *args, + **kwargs, + ) -> None: + super().__init__() + self.proj_block = (width_in != width_out) or (stride != 1) + if self.proj_block: + self.proj = nn.Conv2d( + width_in, width_out, 1, stride=stride, padding=0, bias=False + ) + self.bn = nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum) + self.f = BasicTransform( + width_in, width_out, stride, bn_epsilon, bn_momentum, activation + ) + self.activation = activation + + # The projection and transform happen in parallel, + # and ReLU is not counted with respect to depth + self.depth = self.f.depth + + def forward(self, x: Tensor) -> Tensor: + if self.proj_block: + x = self.bn(self.proj(x)) + self.f(x) + else: + x = x + self.f(x) + + return self.activation(x) + + +class BottleneckTransform(nn.Sequential): + """Bottleneck transformation: 1x1, 3x3 [+SE], 1x1.""" + + def __init__( + self, + width_in: int, + width_out: int, + stride: int, + bn_epsilon: float, + bn_momentum: float, + activation: nn.Module, + group_width: int, + bottleneck_multiplier: float, + se_ratio: Optional[float], + ) -> None: + super().__init__() + w_b = int(round(width_out * bottleneck_multiplier)) + g = w_b // group_width + + self.a = nn.Sequential( + nn.Conv2d(width_in, w_b, 1, stride=1, padding=0, bias=False), + nn.BatchNorm2d(w_b, eps=bn_epsilon, momentum=bn_momentum), + activation, + ) + + self.b = nn.Sequential( + nn.Conv2d(w_b, w_b, 3, stride=stride, padding=1, groups=g, bias=False), + nn.BatchNorm2d(w_b, eps=bn_epsilon, momentum=bn_momentum), + activation, + ) + + if se_ratio: + # The SE reduction ratio is defined with respect to the + # beginning of the block + width_se_out = int(round(se_ratio * width_in)) + self.se = _SqueezeExcitation( + in_channels=w_b, + reduction_ratio=None, + reduced_channels=width_se_out, + activation=activation, + ) + + self.c = nn.Conv2d(w_b, width_out, 1, stride=1, padding=0, bias=False) + self.final_bn = nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum) + self.depth = 3 if not se_ratio else 4 + + +class ResBottleneckBlock(nn.Module): + """Residual bottleneck block: x + F(x), F = bottleneck transform.""" + + def __init__( + self, + width_in: int, + width_out: int, + stride: int, + bn_epsilon: float, + bn_momentum: float, + activation: nn.Module, + group_width: int = 1, + bottleneck_multiplier: float = 1.0, + se_ratio: Optional[float] = None, + ) -> None: + super().__init__() + + # Use skip connection with projection if shape changes + self.proj_block = (width_in != width_out) or (stride != 1) + if self.proj_block: + self.proj = nn.Conv2d( + width_in, width_out, 1, stride=stride, padding=0, bias=False + ) + self.bn = nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum) + self.f = BottleneckTransform( + width_in, + width_out, + stride, + bn_epsilon, + bn_momentum, + activation, + group_width, + bottleneck_multiplier, + se_ratio, + ) + self.activation = activation + + # The projection and transform happen in parallel, + # and activation is not counted with respect to depth + self.depth = self.f.depth + + def forward(self, x: Tensor) -> Tensor: + if self.proj_block: + x = self.bn(self.proj(x)) + self.f(x) + else: + x = x + self.f(x) + return self.activation(x) + + +class ResBottleneckLinearBlock(nn.Module): + """Residual linear bottleneck block: x + F(x), F = bottleneck transform.""" + + def __init__( + self, + width_in: int, + width_out: int, + stride: int, + bn_epsilon: float, + bn_momentum: float, + activation: 
nn.Module, + group_width: int = 1, + bottleneck_multiplier: float = 4.0, + se_ratio: Optional[float] = None, + ) -> None: + super().__init__() + self.has_skip = (width_in == width_out) and (stride == 1) + self.f = BottleneckTransform( + width_in, + width_out, + stride, + bn_epsilon, + bn_momentum, + activation, + group_width, + bottleneck_multiplier, + se_ratio, + ) + + self.depth = self.f.depth + + def forward(self, x: Tensor) -> Tensor: + return x + self.f(x) if self.has_skip else self.f(x) + + +class AnyStage(nn.Sequential): + """AnyNet stage (sequence of blocks w/ the same output shape).""" + + def __init__( + self, + width_in: int, + width_out: int, + stride: int, + depth: int, + block_constructor: nn.Module, + activation: nn.Module, + group_width: int, + bottleneck_multiplier: float, + params: "AnyNetParams", + stage_index: int = 0, + ) -> None: + super().__init__() + self.stage_depth = 0 + + for i in range(depth): + block = block_constructor( + width_in if i == 0 else width_out, + width_out, + stride if i == 0 else 1, + params.bn_epsilon, + params.bn_momentum, + activation, + group_width, + bottleneck_multiplier, + params.se_ratio, + ) + + self.stage_depth += block.depth + self.add_module(f"block{stage_index}-{i}", block) + + class RegNet(nn.Module): - pass + def __init__(self, params: RegNetParams) -> None: + super().__init__() + + if params.activation == ActivationType.SILU and torch.__version__ < "1.7": + raise ValueError("SiLU activation is only supported since PyTorch 1.7") + + silu = None if torch.__version__ < "1.7" else nn.SiLU() + activation = { + ActivationType.RELU: nn.ReLU(inplace=True), + ActivationType.SILU: silu, + }[params.activation] + + # Ad hoc stem + self.stem = { + StemType.RES_STEM_CIFAR: ResStemCifar, + StemType.RES_STEM_IN: ResStemIN, + StemType.SIMPLE_STEM_IN: SimpleStemIN, + }[params.stem_type]( + 3, # width_in + params.stem_width, + params.bn_epsilon, + params.bn_momentum, + activation, + ) + + # Instantiate all the AnyNet blocks in the trunk + block_fun = { + BlockType.VANILLA_BLOCK: VanillaBlock, + BlockType.RES_BASIC_BLOCK: ResBasicBlock, + BlockType.RES_BOTTLENECK_BLOCK: ResBottleneckBlock, + BlockType.RES_BOTTLENECK_LINEAR_BLOCK: ResBottleneckLinearBlock, + }[params.block_type] + + current_width = params.stem_width + + self.trunk_depth = 0 + + blocks = [] + for i, ( + width_out, + stride, + depth, + group_width, + bottleneck_multiplier, + ) in enumerate(params.get_expanded_params()): + blocks.append( + ( + f"block{i+1}", + AnyStage( + current_width, + width_out, + stride, + depth, + block_fun, + activation, + group_width, + bottleneck_multiplier, + params, + stage_index=i + 1, + ), + ) + ) + + self.trunk_depth += blocks[-1][1].stage_depth + + current_width = width_out + + self.trunk_output = nn.Sequential(OrderedDict(blocks)) + + # Init weights and good to go + self._init_weights() + + def forward(self, x: Tensor) -> Tensor: + x = self.stem(x) + x = self.trunk_output(x) + + return x + + def _init_weights(self) -> None: + # Performs ResNet-style weight initialization + for m in self.modules(): + if isinstance(m, nn.Conv2d): + # Note that there is no bias due to BN + fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + m.weight.data.normal_(mean=0.0, std=math.sqrt(2.0 / fan_out)) + elif isinstance(m, nn.BatchNorm2d): + m.weight.data.fill_(1.0) + m.bias.data.zero_() + elif isinstance(m, nn.Linear): + m.weight.data.normal_(mean=0.0, std=0.01) + m.bias.data.zero_() -def _regnet(arch: str, pretrained: bool, progress: bool, **kwargs: Any) -> 
RegNet: - model = RegNet() +def _regnet(arch: str, params: RegNetParams, pretrained: bool, progress: bool, **kwargs: Any) -> RegNet: + model = RegNet(params) if pretrained: if arch not in model_urls: raise ValueError(f"No checkpoint is available for model type {arch}") @@ -65,4 +613,5 @@ def _regnet(arch: str, pretrained: bool, progress: bool, **kwargs: Any) -> RegNe def regnet_y_400mf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: - return _regnet("regnet_y_400mf", pretrained, progress, **kwargs) + params = RegNetParams(depth=16, w_0=48, w_a=27.89, w_m=2.09, group_width=8, **kwargs) + return _regnet("regnet_y_400mf", params, pretrained, progress, **kwargs) From 0dc5bc8549751af75456a503c327c8c1937e8d09 Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Tue, 14 Sep 2021 06:21:00 +0000 Subject: [PATCH 22/40] nit --- torchvision/models/regnet.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py index eb15da6e0ad..80fed6d6a6e 100644 --- a/torchvision/models/regnet.py +++ b/torchvision/models/regnet.py @@ -4,7 +4,7 @@ from collections import OrderedDict from enum import Enum, auto -from typing import Any, List, Optional +from typing import Any, List, Optional, Tuple from torch import nn, Tensor from torchvision.models.mobilenetv2 import _make_divisible @@ -131,7 +131,7 @@ def get_expanded_params(self): @staticmethod def _adjust_widths_groups_compatibilty( stage_widths: List[int], bottleneck_ratios: List[float], - group_widths: List[int]) -> Tuple(List[int], List[int]): + group_widths: List[int]) -> Tuple[List[int], List[int]]: """ Adjusts the compatibility of widths and groups, depending on the bottleneck ratio. From e02d886c1224dbd953d962a5e87c62261888284f Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Tue, 14 Sep 2021 23:06:53 +0000 Subject: [PATCH 23/40] add fc layer --- torchvision/models/regnet.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py index 80fed6d6a6e..5d813d59b9e 100644 --- a/torchvision/models/regnet.py +++ b/torchvision/models/regnet.py @@ -51,6 +51,7 @@ def __init__( se_ratio: float = 0.25, bn_epsilon: float = 1e-05, bn_momentum: bool = 0.1, + num_classes: int = 1000, ) -> None: if w_a < 0 or w_0 <= 0 or w_m <= 1 or w_0 % 8 != 0: raise ValueError("Invalid RegNet settings") @@ -68,6 +69,7 @@ def __init__( self.se_ratio = se_ratio if use_se else None self.bn_epsilon = bn_epsilon self.bn_momentum = bn_momentum + self.num_classes = num_classes def get_expanded_params(self): """ @@ -578,13 +580,20 @@ def __init__(self, params: RegNetParams) -> None: self.trunk_output = nn.Sequential(OrderedDict(blocks)) + self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) + self.fc = nn.Linear(in_features=current_width, out_features=params.num_classes) + # Init weights and good to go self._init_weights() def forward(self, x: Tensor) -> Tensor: x = self.stem(x) x = self.trunk_output(x) - + + x = self.avgpool(x) + x = x.flatten(start_dim=1) + x = self.fc(x) + return x def _init_weights(self) -> None: From 5a6c7294032b5fca9d326c5b01df5637b1c4a0aa Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Wed, 15 Sep 2021 07:19:45 +0000 Subject: [PATCH 24/40] use Callable instead of Enum for block, stem and activation --- torchvision/models/regnet.py | 296 ++++++++++++++++------------------- 1 file changed, 132 insertions(+), 164 deletions(-) diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py index 
5d813d59b9e..76d3381fe5b 100644 --- a/torchvision/models/regnet.py +++ b/torchvision/models/regnet.py @@ -4,150 +4,19 @@ from collections import OrderedDict from enum import Enum, auto -from typing import Any, List, Optional, Tuple +from typing import Any, Callable, List, Optional, Tuple from torch import nn, Tensor + +from .._internally_replaced_utils import load_state_dict_from_url from torchvision.models.mobilenetv2 import _make_divisible model_urls = { + # TODO(kazhang): add pretrained weights + "regnet_y_400m": "", } -# The different possible blocks -class BlockType(Enum): - VANILLA_BLOCK = auto() - RES_BASIC_BLOCK = auto() - RES_BOTTLENECK_BLOCK = auto() - RES_BOTTLENECK_LINEAR_BLOCK = auto() - - -# The different possible Stems -class StemType(Enum): - RES_STEM_CIFAR = auto() - RES_STEM_IN = auto() - SIMPLE_STEM_IN = auto() - - -# The different possible activations -class ActivationType(Enum): - RELU = auto() - SILU = auto() - - -class RegNetParams: - def __init__( - self, - depth: int, - w_0: int, - w_a: float, - w_m: float, - group_width: int, - bottleneck_multiplier: float = 1.0, - stem_type: StemType = StemType.SIMPLE_STEM_IN, - stem_width: int = 32, - block_type: BlockType = BlockType.RES_BOTTLENECK_BLOCK, - activation: ActivationType = ActivationType.RELU, - use_se: bool = True, - se_ratio: float = 0.25, - bn_epsilon: float = 1e-05, - bn_momentum: bool = 0.1, - num_classes: int = 1000, - ) -> None: - if w_a < 0 or w_0 <= 0 or w_m <= 1 or w_0 % 8 != 0: - raise ValueError("Invalid RegNet settings") - self.depth = depth - self.w_0 = w_0 - self.w_a = w_a - self.w_m = w_m - self.group_width = group_width - self.bottleneck_multiplier = bottleneck_multiplier - self.stem_type = stem_type - self.block_type = block_type - self.activation = activation - self.stem_width = stem_width - self.use_se = use_se - self.se_ratio = se_ratio if use_se else None - self.bn_epsilon = bn_epsilon - self.bn_momentum = bn_momentum - self.num_classes = num_classes - - def get_expanded_params(self): - """ - Programatically compute all the per-block settings, - given the RegNet parameters. - - The first step is to compute the quantized linear block parameters, - in log space. Key parameters are: - - `w_a` is the width progression slope - - `w_0` is the initial width - - `w_m` is the width stepping in the log space - - In other terms - `log(block_width) = log(w_0) + w_m * block_capacity`, - with `bock_capacity` ramping up following the w_0 and w_a params. - This block width is finally quantized to multiples of 8. - - The second step is to compute the parameters per stage, - taking into account the skip connection and the final 1x1 convolutions. - We use the fact that the output width is constant within a stage. - """ - - QUANT = 8 - STRIDE = 2 - - # Compute the block widths. 
Each stage has one unique block width - widths_cont = np.arange(self.depth) * self.w_a + self.w_0 - block_capacity = np.round(np.log(widths_cont / self.w_0) / np.log(self.w_m)) - block_widths = ( - np.round(np.divide(self.w_0 * np.power(self.w_m, block_capacity), QUANT)) - * QUANT - ) - num_stages = len(np.unique(block_widths)) - block_widths = block_widths.astype(int).tolist() - - # Convert to per stage parameters - split_helper = zip( - block_widths + [0], - [0] + block_widths, - block_widths + [0], - [0] + block_widths, - ) - splits = [w != wp or r != rp for w, wp, r, rp in split_helper] - - stage_widths = [w for w, t in zip(block_widths, splits[:-1]) if t] - stage_depths = np.diff([d for d, t in enumerate(splits) if t]).tolist() - - strides = [STRIDE] * num_stages - bottleneck_multipliers = [self.bottleneck_multiplier] * num_stages - group_widths = [self.group_width] * num_stages - - # Adjust the compatibility of stage widths and group widths - stage_widths, group_widths = self._adjust_widths_groups_compatibilty( - stage_widths, bottleneck_multipliers, group_widths - ) - - return zip( - stage_widths, strides, stage_depths, group_widths, bottleneck_multipliers - ) - - @staticmethod - def _adjust_widths_groups_compatibilty( - stage_widths: List[int], bottleneck_ratios: List[float], - group_widths: List[int]) -> Tuple[List[int], List[int]]: - """ - Adjusts the compatibility of widths and groups, - depending on the bottleneck ratio. - """ - # Compute all widths for the current settings - widths = [int(w * b) for w, b in zip(stage_widths, bottleneck_ratios)] - group_widths_min = [min(g, w_bot) for g, w_bot in zip(group_widths, widths)] - - # Compute the adjusted widths so that stage and group widths fit - ws_bot = [_make_divisible(w_bot, g) for w_bot, g in zip(widths, group_widths_min)] - stage_widths = [int(w_bot / b) for w_bot, b in zip(ws_bot, bottleneck_ratios)] - return stage_widths, group_widths_min - - class _SqueezeExcitation(nn.Module): """ Squeeze and excitation layer from @@ -483,11 +352,13 @@ def __init__( width_out: int, stride: int, depth: int, - block_constructor: nn.Module, + block_constructor: Callable[..., nn.Module], + bn_epsilon: float, + bn_momentum: float, activation: nn.Module, group_width: int, bottleneck_multiplier: float, - params: "AnyNetParams", + se_ratio: Optional[float] = None, stage_index: int = 0, ) -> None: super().__init__() @@ -498,37 +369,140 @@ def __init__( width_in if i == 0 else width_out, width_out, stride if i == 0 else 1, - params.bn_epsilon, - params.bn_momentum, + bn_epsilon, + bn_momentum, activation, group_width, bottleneck_multiplier, - params.se_ratio, + se_ratio, ) self.stage_depth += block.depth self.add_module(f"block{stage_index}-{i}", block) +class RegNetParams: + def __init__( + self, + depth: int, + w_0: int, + w_a: float, + w_m: float, + group_width: int, + bottleneck_multiplier: float = 1.0, + stem_type: Callable[..., nn.Module] = SimpleStemIN, + stem_width: int = 32, + block_type: Callable[..., nn.Module] = ResBottleneckBlock, + activation: Callable[..., nn.Module] = nn.ReLU, + use_se: bool = True, + se_ratio: float = 0.25, + bn_epsilon: float = 1e-05, + bn_momentum: float = 0.1, + num_classes: int = 1000, + ) -> None: + if w_a < 0 or w_0 <= 0 or w_m <= 1 or w_0 % 8 != 0: + raise ValueError("Invalid RegNet settings") + self.depth = depth + self.w_0 = w_0 + self.w_a = w_a + self.w_m = w_m + self.group_width = group_width + self.bottleneck_multiplier = bottleneck_multiplier + self.stem_type = stem_type + self.block_type = block_type 
+ self.activation = activation + self.stem_width = stem_width + self.use_se = use_se + self.se_ratio = se_ratio if use_se else None + self.bn_epsilon = bn_epsilon + self.bn_momentum = bn_momentum + self.num_classes = num_classes + + def get_expanded_params(self): + """ + Programatically compute all the per-block settings, + given the RegNet parameters. + + The first step is to compute the quantized linear block parameters, + in log space. Key parameters are: + - `w_a` is the width progression slope + - `w_0` is the initial width + - `w_m` is the width stepping in the log space + + In other terms + `log(block_width) = log(w_0) + w_m * block_capacity`, + with `bock_capacity` ramping up following the w_0 and w_a params. + This block width is finally quantized to multiples of 8. + + The second step is to compute the parameters per stage, + taking into account the skip connection and the final 1x1 convolutions. + We use the fact that the output width is constant within a stage. + """ + + QUANT = 8 + STRIDE = 2 + + # Compute the block widths. Each stage has one unique block width + widths_cont = np.arange(self.depth) * self.w_a + self.w_0 + block_capacity = np.round(np.log(widths_cont / self.w_0) / np.log(self.w_m)) + block_widths = ( + np.round(np.divide(self.w_0 * np.power(self.w_m, block_capacity), QUANT)) + * QUANT + ) + num_stages = len(np.unique(block_widths)) + block_widths = block_widths.astype(int).tolist() + + # Convert to per stage parameters + split_helper = zip( + block_widths + [0], + [0] + block_widths, + block_widths + [0], + [0] + block_widths, + ) + splits = [w != wp or r != rp for w, wp, r, rp in split_helper] + + stage_widths = [w for w, t in zip(block_widths, splits[:-1]) if t] + stage_depths = np.diff([d for d, t in enumerate(splits) if t]).tolist() + + strides = [STRIDE] * num_stages + bottleneck_multipliers = [self.bottleneck_multiplier] * num_stages + group_widths = [self.group_width] * num_stages + + # Adjust the compatibility of stage widths and group widths + stage_widths, group_widths = self._adjust_widths_groups_compatibilty( + stage_widths, bottleneck_multipliers, group_widths + ) + + return zip( + stage_widths, strides, stage_depths, group_widths, bottleneck_multipliers + ) + + @staticmethod + def _adjust_widths_groups_compatibilty( + stage_widths: List[int], bottleneck_ratios: List[float], + group_widths: List[int]) -> Tuple[List[int], List[int]]: + """ + Adjusts the compatibility of widths and groups, + depending on the bottleneck ratio. 
+ """ + # Compute all widths for the current settings + widths = [int(w * b) for w, b in zip(stage_widths, bottleneck_ratios)] + group_widths_min = [min(g, w_bot) for g, w_bot in zip(group_widths, widths)] + + # Compute the adjusted widths so that stage and group widths fit + ws_bot = [_make_divisible(w_bot, g) for w_bot, g in zip(widths, group_widths_min)] + stage_widths = [int(w_bot / b) for w_bot, b in zip(ws_bot, bottleneck_ratios)] + return stage_widths, group_widths_min + + class RegNet(nn.Module): def __init__(self, params: RegNetParams) -> None: super().__init__() - if params.activation == ActivationType.SILU and torch.__version__ < "1.7": - raise ValueError("SiLU activation is only supported since PyTorch 1.7") - - silu = None if torch.__version__ < "1.7" else nn.SiLU() - activation = { - ActivationType.RELU: nn.ReLU(inplace=True), - ActivationType.SILU: silu, - }[params.activation] + activation = params.activation(inplace=True) # Ad hoc stem - self.stem = { - StemType.RES_STEM_CIFAR: ResStemCifar, - StemType.RES_STEM_IN: ResStemIN, - StemType.SIMPLE_STEM_IN: SimpleStemIN, - }[params.stem_type]( + self.stem = params.stem_type( 3, # width_in params.stem_width, params.bn_epsilon, @@ -536,14 +510,6 @@ def __init__(self, params: RegNetParams) -> None: activation, ) - # Instantiate all the AnyNet blocks in the trunk - block_fun = { - BlockType.VANILLA_BLOCK: VanillaBlock, - BlockType.RES_BASIC_BLOCK: ResBasicBlock, - BlockType.RES_BOTTLENECK_BLOCK: ResBottleneckBlock, - BlockType.RES_BOTTLENECK_LINEAR_BLOCK: ResBottleneckLinearBlock, - }[params.block_type] - current_width = params.stem_width self.trunk_depth = 0 @@ -564,11 +530,13 @@ def __init__(self, params: RegNetParams) -> None: width_out, stride, depth, - block_fun, + params.block_type, + params.bn_epsilon, + params.bn_momentum, activation, group_width, bottleneck_multiplier, - params, + params.se_ratio, stage_index=i + 1, ), ) From 48a6e36479f8f8bbdd285386d980cc11efe3e727 Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Wed, 15 Sep 2021 21:30:52 +0000 Subject: [PATCH 25/40] add regnet_x and regnet_y model build functions, add docs --- torchvision/models/regnet.py | 193 ++++++++++++++++++++++++++++++++++- 1 file changed, 192 insertions(+), 1 deletion(-) diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py index 76d3381fe5b..98c60296015 100644 --- a/torchvision/models/regnet.py +++ b/torchvision/models/regnet.py @@ -1,3 +1,8 @@ +# Modified from +# https://github.com/facebookresearch/ClassyVision/blob/main/classy_vision/models/anynet.py +# https://github.com/facebookresearch/ClassyVision/blob/main/classy_vision/models/regnet.py + + import numpy as np import math import torch @@ -10,7 +15,6 @@ from .._internally_replaced_utils import load_state_dict_from_url from torchvision.models.mobilenetv2 import _make_divisible - model_urls = { # TODO(kazhang): add pretrained weights "regnet_y_400m": "", @@ -590,5 +594,192 @@ def _regnet(arch: str, params: RegNetParams, pretrained: bool, progress: bool, * def regnet_y_400mf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetY_400MF architecture from + `"Designing Network Design Spaces" `_. 
+ + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ params = RegNetParams(depth=16, w_0=48, w_a=27.89, w_m=2.09, group_width=8, **kwargs) return _regnet("regnet_y_400mf", params, pretrained, progress, **kwargs) + + +def regnet_y_800mf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetY_800MF architecture from + `"Designing Network Design Spaces" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = RegNetParams(depth=14, w_0=56, w_a=38.84, w_m=2.4, group_width=16, **kwargs) + return _regnet("regnet_y_800mf", params, pretrained, progress, **kwargs) + + +def regnet_y_1_6gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetY_1.6GF architecture from + `"Designing Network Design Spaces" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = RegNetParams(depth=27, w_0=48, w_a=20.71, w_m=2.65, group_width=24, **kwargs) + return _regnet("regnet_y_1_6gf", params, pretrained, progress, **kwargs) + + +def regnet_y_3_2gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetY_3.2GF architecture from + `"Designing Network Design Spaces" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = RegNetParams(depth=21, w_0=80, w_a=42.63, w_m=2.66, group_width=24, **kwargs) + return _regnet("regnet_y_3_2gf", params, pretrained, progress, **kwargs) + + +def regnet_y_8gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetY_8GF architecture from + `"Designing Network Design Spaces" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = RegNetParams(depth=17, w_0=192, w_a=76.82, w_m=2.19, group_width=56, **kwargs) + return _regnet("regnet_y_8gf", params, pretrained, progress, **kwargs) + + +def regnet_y_16gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetY_16GF architecture from + `"Designing Network Design Spaces" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = RegNetParams(depth=18, w_0=200, w_a=106.23, w_m=2.48, group_width=112, **kwargs) + return _regnet("regnet_y_16gf", params, pretrained, progress, **kwargs) + + +def regnet_y_32gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetY_32GF architecture from + `"Designing Network Design Spaces" `_. 
+ + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = RegNetParams(depth=20, w_0=232, w_a=115.89, w_m=2.53, group_width=232, **kwargs) + return _regnet("regnet_y_32gf", params, pretrained, progress, **kwargs) + + +def regnet_x_400mf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetX_400MF architecture from + `"Designing Network Design Spaces" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = RegNetParams(depth=22, w_0=24, w_a=24.48, w_m=2.54, + group_width=16, use_se=False, **kwargs) + + return _regnet("regnet_x_400mf", params, pretrained, progress, **kwargs) + + +def regnet_x_800mf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetX_800MF architecture from + `"Designing Network Design Spaces" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = RegNetParams(depth=16, w_0=56, w_a=35.73, w_m=2.28, group_width=16, + use_se=False, **kwargs) + return _regnet("regnet_x_800mf", params, pretrained, progress, **kwargs) + + +def regnet_x_1_6gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetX_1.6GF architecture from + `"Designing Network Design Spaces" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = RegNetParams(depth=18, w_0=80, w_a=34.01, w_m=2.25, group_width=24, + use_se=False, **kwargs) + return _regnet("regnet_x_1_6gf", params, pretrained, progress, **kwargs) + + +def regnet_x_3_2gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetX_3.2GF architecture from + `"Designing Network Design Spaces" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = RegNetParams(depth=25, w_0=88, w_a=26.31, w_m=2.25, group_width=48, + use_se=False, **kwargs) + return _regnet("regnet_x_3_2gf", params, pretrained, progress, **kwargs) + + +def regnet_x_8gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetX_8GF architecture from + `"Designing Network Design Spaces" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = RegNetParams(depth=23, w_0=80, w_a=49.56, w_m=2.88, group_width=120, + use_se=False, **kwargs) + return _regnet("regnet_x_8gf", params, pretrained, progress, **kwargs) + + +def regnet_x_16gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetX_16GF architecture from + `"Designing Network Design Spaces" `_. 
+ + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = RegNetParams(depth=22, w_0=216, w_a=55.59, w_m=2.1, group_width=128, + use_se=False, **kwargs) + return _regnet("regnet_x_16gf", params, pretrained, progress, **kwargs) + + +def regnet_x_32gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> RegNet: + """ + Constructs a RegNetX_32GF architecture from + `"Designing Network Design Spaces" `_. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + progress (bool): If True, displays a progress bar of the download to stderr + """ + params = RegNetParams(depth=23, w_0=320, w_a=69.86, w_m=2.0, group_width=168, + use_se=False, **kwargs) + return _regnet("regnet_x_32gf", params, pretrained, progress, **kwargs) + +# TODO(kazhang): Add RegNetZ_500MF and RegNetZ_4GF From 2dbcd6d7dfbae4056ed79d3ffe35ed80d3a09fea Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Thu, 16 Sep 2021 23:25:42 +0000 Subject: [PATCH 26/40] remove unused depth --- torchvision/models/regnet.py | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py index 98c60296015..d405945267e 100644 --- a/torchvision/models/regnet.py +++ b/torchvision/models/regnet.py @@ -8,7 +8,6 @@ import torch from collections import OrderedDict -from enum import Enum, auto from typing import Any, Callable, List, Optional, Tuple from torch import nn, Tensor @@ -83,7 +82,6 @@ def __init__( ) self.final_bn = nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum) - self.depth = 2 class ResStemCifar(nn.Sequential): @@ -103,7 +101,6 @@ def __init__( nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum), activation, ) - self.depth = 2 class ResStemIN(nn.Sequential): @@ -124,7 +121,6 @@ def __init__( activation, nn.MaxPool2d(3, stride=2, padding=1), ) - self.depth = 3 class SimpleStemIN(nn.Sequential): @@ -144,7 +140,6 @@ def __init__( nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum), activation, ) - self.depth = 2 class VanillaBlock(nn.Sequential): @@ -174,8 +169,6 @@ def __init__( activation, ) - self.depth = 2 - class ResBasicBlock(nn.Module): """Residual basic block: x + F(x), F = basic transform.""" @@ -203,10 +196,6 @@ def __init__( ) self.activation = activation - # The projection and transform happen in parallel, - # and ReLU is not counted with respect to depth - self.depth = self.f.depth - def forward(self, x: Tensor) -> Tensor: if self.proj_block: x = self.bn(self.proj(x)) + self.f(x) @@ -260,7 +249,6 @@ def __init__( self.c = nn.Conv2d(w_b, width_out, 1, stride=1, padding=0, bias=False) self.final_bn = nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum) - self.depth = 3 if not se_ratio else 4 class ResBottleneckBlock(nn.Module): @@ -302,7 +290,6 @@ def __init__( # The projection and transform happen in parallel, # and activation is not counted with respect to depth - self.depth = self.f.depth def forward(self, x: Tensor) -> Tensor: if self.proj_block: @@ -341,8 +328,6 @@ def __init__( se_ratio, ) - self.depth = self.f.depth - def forward(self, x: Tensor) -> Tensor: return x + self.f(x) if self.has_skip else self.f(x) @@ -366,7 +351,6 @@ def __init__( stage_index: int = 0, ) -> None: super().__init__() - self.stage_depth = 0 for i in range(depth): block = block_constructor( @@ -381,7 +365,6 @@ def __init__( se_ratio, ) - self.stage_depth += block.depth 
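# A minimal usage sketch (assuming the module is importable as
# torchvision.models.regnet at this point in the series): the per-stage
# settings produced by get_expanded_params() can be inspected directly.
# The parameter values below are the regnet_y_400mf settings used by the
# builders in this file.
from torchvision.models.regnet import RegNetParams

params = RegNetParams(depth=16, w_0=48, w_a=27.89, w_m=2.09, group_width=8)
for width_out, stride, depth, group_width, bottleneck_multiplier in params.get_expanded_params():
    # One tuple per stage: output width, stride, number of blocks,
    # group width and bottleneck multiplier.
    print(width_out, stride, depth, group_width, bottleneck_multiplier)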
self.add_module(f"block{stage_index}-{i}", block) @@ -516,8 +499,6 @@ def __init__(self, params: RegNetParams) -> None: current_width = params.stem_width - self.trunk_depth = 0 - blocks = [] for i, ( width_out, @@ -546,8 +527,6 @@ def __init__(self, params: RegNetParams) -> None: ) ) - self.trunk_depth += blocks[-1][1].stage_depth - current_width = width_out self.trunk_output = nn.Sequential(OrderedDict(blocks)) @@ -695,7 +674,6 @@ def regnet_x_400mf(pretrained: bool = False, progress: bool = True, **kwargs: An """ params = RegNetParams(depth=22, w_0=24, w_a=24.48, w_m=2.54, group_width=16, use_se=False, **kwargs) - return _regnet("regnet_x_400mf", params, pretrained, progress, **kwargs) From baca24fee5fd3a9d2308bccbbb37ae89d5b9eba5 Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Fri, 17 Sep 2021 07:10:50 +0000 Subject: [PATCH 27/40] use BN/activation constructor and ConvBNActivation --- torchvision/models/regnet.py | 187 +++++++++++++---------------------- 1 file changed, 66 insertions(+), 121 deletions(-) diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py index d405945267e..9c3330ec726 100644 --- a/torchvision/models/regnet.py +++ b/torchvision/models/regnet.py @@ -8,11 +8,12 @@ import torch from collections import OrderedDict +from functools import partial from typing import Any, Callable, List, Optional, Tuple from torch import nn, Tensor from .._internally_replaced_utils import load_state_dict_from_url -from torchvision.models.mobilenetv2 import _make_divisible +from torchvision.models.mobilenetv2 import ConvBNActivation, _make_divisible model_urls = { # TODO(kazhang): add pretrained weights @@ -68,39 +69,17 @@ def __init__( width_in: int, width_out: int, stride: int, - bn_epsilon: float, - bn_momentum: float, - activation: nn.Module, + norm_layer: Callable[..., nn.Module], + activation_layer: Callable[..., nn.Module], ) -> None: - super().__init__() - - self.a = nn.Sequential( - nn.Conv2d(width_in, width_out, 3, stride=stride, padding=1, bias=False), - nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum), - activation, - nn.Conv2d(width_out, width_out, 3, stride=1, padding=1, bias=False), - ) - - self.final_bn = nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum) - - -class ResStemCifar(nn.Sequential): - """ResNet stem for CIFAR: 3x3, BN, ReLU.""" - - def __init__( - self, - width_in: int, - width_out: int, - bn_epsilon: float, - bn_momentum: float, - activation: nn.Module, - ) -> None: - super().__init__() - self.stem = nn.Sequential( - nn.Conv2d(width_in, width_out, 3, stride=1, padding=1, bias=False), - nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum), - activation, - ) + super().__init__(OrderedDict( + a=nn.Sequential( + ConvBNActivation(width_in, width_out, kernel_size=3, stride=stride, + norm_layer=norm_layer, activation_layer=activation_layer), + nn.Conv2d(width_out, width_out, 3, stride=1, padding=1, bias=False), + ), + final_bn=norm_layer(width_out), + )) class ResStemIN(nn.Sequential): @@ -110,36 +89,28 @@ def __init__( self, width_in: int, width_out: int, - bn_epsilon: float, - bn_momentum: float, - activation: nn.Module, + norm_layer: Callable[..., nn.Module], + activation_layer: Callable[..., nn.Module], ) -> None: - super().__init__() - self.stem = nn.Sequential( - nn.Conv2d(width_in, width_out, 7, stride=2, padding=3, bias=False), - nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum), - activation, + super().__init__( + ConvBNActivation(width_in, width_out, kernel_size=7, stride=2, + 
norm_layer=norm_layer, activation_layer=activation_layer), nn.MaxPool2d(3, stride=2, padding=1), ) -class SimpleStemIN(nn.Sequential): +class SimpleStemIN(ConvBNActivation): """Simple stem for ImageNet: 3x3, BN, ReLU.""" def __init__( self, width_in: int, width_out: int, - bn_epsilon: float, - bn_momentum: float, - activation: nn.Module, + norm_layer: Callable[..., nn.Module], + activation_layer: Callable[..., nn.Module], ) -> None: - super().__init__() - self.stem = nn.Sequential( - nn.Conv2d(width_in, width_out, 3, stride=2, padding=1, bias=False), - nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum), - activation, - ) + super().__init__(width_in, width_out, kernel_size=3, stride=2, + norm_layer=norm_layer, activation_layer=activation_layer) class VanillaBlock(nn.Sequential): @@ -150,24 +121,17 @@ def __init__( width_in: int, width_out: int, stride: int, - bn_epsilon: float, - bn_momentum: float, - activation: nn.Module, + norm_layer: Callable[..., nn.Module], + activation_layer: Callable[..., nn.Module], *args, **kwargs, ) -> None: - super().__init__() - self.a = nn.Sequential( - nn.Conv2d(width_in, width_out, 3, stride=stride, padding=1, bias=False), - nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum), - activation, - ) - - self.b = nn.Sequential( - nn.Conv2d(width_out, width_out, 3, stride=1, padding=1, bias=False), - nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum), - activation, - ) + super().__init__(OrderedDict( + a=ConvBNActivation(width_in, width_out, kernel_size=3, stride=stride, + norm_layer=norm_layer, activation_layer=activation_layer), + b=ConvBNActivation(width_out, width_out, kernel_size=3, stride=1, + norm_layer=norm_layer, activation_layer=activation_layer), + )) class ResBasicBlock(nn.Module): @@ -178,9 +142,8 @@ def __init__( width_in: int, width_out: int, stride: int, - bn_epsilon: float, - bn_momentum: float, - activation: nn.Module, + norm_layer: Callable[..., nn.Module], + activation_layer: Callable[..., nn.Module], *args, **kwargs, ) -> None: @@ -190,11 +153,11 @@ def __init__( self.proj = nn.Conv2d( width_in, width_out, 1, stride=stride, padding=0, bias=False ) - self.bn = nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum) + self.bn = norm_layer(width_out) self.f = BasicTransform( - width_in, width_out, stride, bn_epsilon, bn_momentum, activation + width_in, width_out, stride, norm_layer, activation_layer ) - self.activation = activation + self.activation = activation_layer(inplace=True) def forward(self, x: Tensor) -> Tensor: if self.proj_block: @@ -213,42 +176,35 @@ def __init__( width_in: int, width_out: int, stride: int, - bn_epsilon: float, - bn_momentum: float, - activation: nn.Module, + norm_layer: Callable[..., nn.Module], + activation_layer: Callable[..., nn.Module], group_width: int, bottleneck_multiplier: float, se_ratio: Optional[float], ) -> None: - super().__init__() + layers = OrderedDict() w_b = int(round(width_out * bottleneck_multiplier)) g = w_b // group_width - self.a = nn.Sequential( - nn.Conv2d(width_in, w_b, 1, stride=1, padding=0, bias=False), - nn.BatchNorm2d(w_b, eps=bn_epsilon, momentum=bn_momentum), - activation, - ) - - self.b = nn.Sequential( - nn.Conv2d(w_b, w_b, 3, stride=stride, padding=1, groups=g, bias=False), - nn.BatchNorm2d(w_b, eps=bn_epsilon, momentum=bn_momentum), - activation, - ) + layers["a"] = ConvBNActivation(width_in, w_b, kernel_size=1, stride=1, + norm_layer=norm_layer, activation_layer=activation_layer) + layers["b"] = ConvBNActivation(w_b, w_b, 
kernel_size=3, stride=stride, groups=g, + norm_layer=norm_layer, activation_layer=activation_layer) if se_ratio: # The SE reduction ratio is defined with respect to the # beginning of the block width_se_out = int(round(se_ratio * width_in)) - self.se = _SqueezeExcitation( + layers["se"] = _SqueezeExcitation( in_channels=w_b, reduction_ratio=None, reduced_channels=width_se_out, - activation=activation, + activation=activation_layer(inplace=True), ) - self.c = nn.Conv2d(w_b, width_out, 1, stride=1, padding=0, bias=False) - self.final_bn = nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum) + layers["c"] = nn.Conv2d(w_b, width_out, 1, stride=1, padding=0, bias=False) + layers["final_bn"] = norm_layer(width_out) + super().__init__(layers) class ResBottleneckBlock(nn.Module): @@ -259,9 +215,8 @@ def __init__( width_in: int, width_out: int, stride: int, - bn_epsilon: float, - bn_momentum: float, - activation: nn.Module, + norm_layer: Callable[..., nn.Module], + activation_layer: Callable[..., nn.Module], group_width: int = 1, bottleneck_multiplier: float = 1.0, se_ratio: Optional[float] = None, @@ -274,19 +229,18 @@ def __init__( self.proj = nn.Conv2d( width_in, width_out, 1, stride=stride, padding=0, bias=False ) - self.bn = nn.BatchNorm2d(width_out, eps=bn_epsilon, momentum=bn_momentum) + self.bn = norm_layer(width_out) self.f = BottleneckTransform( width_in, width_out, stride, - bn_epsilon, - bn_momentum, - activation, + norm_layer, + activation_layer, group_width, bottleneck_multiplier, se_ratio, ) - self.activation = activation + self.activation = activation_layer(inplace=True) # The projection and transform happen in parallel, # and activation is not counted with respect to depth @@ -307,9 +261,8 @@ def __init__( width_in: int, width_out: int, stride: int, - bn_epsilon: float, - bn_momentum: float, - activation: nn.Module, + norm_layer: Callable[..., nn.Module], + activation_layer: Callable[..., nn.Module], group_width: int = 1, bottleneck_multiplier: float = 4.0, se_ratio: Optional[float] = None, @@ -320,9 +273,8 @@ def __init__( width_in, width_out, stride, - bn_epsilon, - bn_momentum, - activation, + norm_layer, + activation_layer, group_width, bottleneck_multiplier, se_ratio, @@ -342,9 +294,8 @@ def __init__( stride: int, depth: int, block_constructor: Callable[..., nn.Module], - bn_epsilon: float, - bn_momentum: float, - activation: nn.Module, + norm_layer: Callable[..., nn.Module], + activation_layer: Callable[..., nn.Module], group_width: int, bottleneck_multiplier: float, se_ratio: Optional[float] = None, @@ -357,9 +308,8 @@ def __init__( width_in if i == 0 else width_out, width_out, stride if i == 0 else 1, - bn_epsilon, - bn_momentum, - activation, + norm_layer, + activation_layer, group_width, bottleneck_multiplier, se_ratio, @@ -398,11 +348,10 @@ def __init__( self.stem_type = stem_type self.block_type = block_type self.activation = activation + self.norm_layer = partial(nn.BatchNorm2d, eps=bn_epsilon, momentum=bn_momentum) self.stem_width = stem_width self.use_se = use_se self.se_ratio = se_ratio if use_se else None - self.bn_epsilon = bn_epsilon - self.bn_momentum = bn_momentum self.num_classes = num_classes def get_expanded_params(self): @@ -486,15 +435,12 @@ class RegNet(nn.Module): def __init__(self, params: RegNetParams) -> None: super().__init__() - activation = params.activation(inplace=True) - # Ad hoc stem self.stem = params.stem_type( 3, # width_in params.stem_width, - params.bn_epsilon, - params.bn_momentum, - activation, + params.norm_layer, + 
params.activation, ) current_width = params.stem_width @@ -516,9 +462,8 @@ def __init__(self, params: RegNetParams) -> None: stride, depth, params.block_type, - params.bn_epsilon, - params.bn_momentum, - activation, + params.norm_layer, + params.activation, group_width, bottleneck_multiplier, params.se_ratio, From 233bdff2c1c256cec38d2b1a4c7cf747c5694051 Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Mon, 20 Sep 2021 00:02:08 +0000 Subject: [PATCH 28/40] reuse SqueezeExcitation from efficientnet --- torchvision/models/regnet.py | 50 ++++-------------------------------- 1 file changed, 5 insertions(+), 45 deletions(-) diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py index 9c3330ec726..32b4bc9d4b2 100644 --- a/torchvision/models/regnet.py +++ b/torchvision/models/regnet.py @@ -14,6 +14,7 @@ from .._internally_replaced_utils import load_state_dict_from_url from torchvision.models.mobilenetv2 import ConvBNActivation, _make_divisible +from torchvision.models.efficientnet import SqueezeExcitation model_urls = { # TODO(kazhang): add pretrained weights @@ -21,46 +22,6 @@ } -class _SqueezeExcitation(nn.Module): - """ - Squeeze and excitation layer from - `"Squeeze-and-Excitation Networks" `_. - """ - - def __init__( - self, - in_channels: int, - reduction_ratio: Optional[int] = 16, - reduced_channels: Optional[int] = None, - activation: Optional[nn.Module] = None, - ) -> None: - super().__init__() - self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) - - # Either reduction_ratio is defined, or out_channels is defined, - # neither both nor none of them - assert bool(reduction_ratio) != bool(reduced_channels) - - if activation is None: - activation = nn.ReLU() - - reduced_channels = ( - in_channels // reduction_ratio if reduced_channels is None else reduced_channels - ) - self.excitation = nn.Sequential( - nn.Conv2d(in_channels, reduced_channels, kernel_size=1, stride=1, bias=True), - activation, - nn.Conv2d(reduced_channels, in_channels, kernel_size=1, stride=1, bias=True), - nn.Sigmoid(), - ) - - def forward(self, x: Tensor) -> Tensor: - x_squeezed = self.avgpool(x) - x_excited = self.excitation(x_squeezed) - x_scaled = x * x_excited - return x_scaled - - class BasicTransform(nn.Sequential): """Basic transformation: [3x3 conv, BN, Relu] x2.""" @@ -195,11 +156,10 @@ def __init__( # The SE reduction ratio is defined with respect to the # beginning of the block width_se_out = int(round(se_ratio * width_in)) - layers["se"] = _SqueezeExcitation( - in_channels=w_b, - reduction_ratio=None, - reduced_channels=width_se_out, - activation=activation_layer(inplace=True), + layers["se"] = SqueezeExcitation( + input_channels=w_b, + squeeze_channels=width_se_out, + activation=activation_layer, ) layers["c"] = nn.Conv2d(w_b, width_out, 1, stride=1, padding=0, bias=False) From 0968d279ba072136a4d28b6c8ef2a7951cec847d Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Mon, 20 Sep 2021 21:04:35 +0000 Subject: [PATCH 29/40] refactor RegNetParams into BlockParams --- torchvision/models/regnet.py | 103 ++++++++++++++++++----------------- 1 file changed, 54 insertions(+), 49 deletions(-) diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py index 32b4bc9d4b2..7e94bf04978 100644 --- a/torchvision/models/regnet.py +++ b/torchvision/models/regnet.py @@ -278,7 +278,7 @@ def __init__( self.add_module(f"block{stage_index}-{i}", block) -class RegNetParams: +class BlockParams: def __init__( self, depth: int, @@ -287,15 +287,8 @@ def __init__( w_m: float, group_width: int, bottleneck_multiplier: 
float = 1.0, - stem_type: Callable[..., nn.Module] = SimpleStemIN, - stem_width: int = 32, - block_type: Callable[..., nn.Module] = ResBottleneckBlock, - activation: Callable[..., nn.Module] = nn.ReLU, use_se: bool = True, se_ratio: float = 0.25, - bn_epsilon: float = 1e-05, - bn_momentum: float = 0.1, - num_classes: int = 1000, ) -> None: if w_a < 0 or w_0 <= 0 or w_m <= 1 or w_0 % 8 != 0: raise ValueError("Invalid RegNet settings") @@ -305,14 +298,8 @@ def __init__( self.w_m = w_m self.group_width = group_width self.bottleneck_multiplier = bottleneck_multiplier - self.stem_type = stem_type - self.block_type = block_type - self.activation = activation - self.norm_layer = partial(nn.BatchNorm2d, eps=bn_epsilon, momentum=bn_momentum) - self.stem_width = stem_width self.use_se = use_se self.se_ratio = se_ratio if use_se else None - self.num_classes = num_classes def get_expanded_params(self): """ @@ -392,18 +379,36 @@ def _adjust_widths_groups_compatibilty( class RegNet(nn.Module): - def __init__(self, params: RegNetParams) -> None: + def __init__( + self, + block_params: BlockParams, + num_classes: int = 1000, + stem_width: int = 32, + stem_type: Optional[Callable[..., nn.Module]] = None, + block_type: Optional[Callable[..., nn.Module]] = None, + norm_layer: Optional[Callable[..., nn.Module]] = None, + activation: Optional[Callable[..., nn.Module]] = None, + ) -> None: super().__init__() + if stem_type is None: + stem_type = SimpleStemIN + if norm_layer is None: + norm_layer = nn.BatchNorm2d + if block_type is None: + block_type = ResBottleneckBlock + if activation is None: + activation = nn.ReLU + # Ad hoc stem - self.stem = params.stem_type( + self.stem = stem_type( 3, # width_in - params.stem_width, - params.norm_layer, - params.activation, + stem_width, + norm_layer, + activation, ) - current_width = params.stem_width + current_width = stem_width blocks = [] for i, ( @@ -412,7 +417,7 @@ def __init__(self, params: RegNetParams) -> None: depth, group_width, bottleneck_multiplier, - ) in enumerate(params.get_expanded_params()): + ) in enumerate(block_params.get_expanded_params()): blocks.append( ( f"block{i+1}", @@ -421,12 +426,12 @@ def __init__(self, params: RegNetParams) -> None: width_out, stride, depth, - params.block_type, - params.norm_layer, - params.activation, + block_type, + norm_layer, + activation, group_width, bottleneck_multiplier, - params.se_ratio, + block_params.se_ratio, stage_index=i + 1, ), ) @@ -437,7 +442,7 @@ def __init__(self, params: RegNetParams) -> None: self.trunk_output = nn.Sequential(OrderedDict(blocks)) self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) - self.fc = nn.Linear(in_features=current_width, out_features=params.num_classes) + self.fc = nn.Linear(in_features=current_width, out_features=num_classes) # Init weights and good to go self._init_weights() @@ -467,8 +472,8 @@ def _init_weights(self) -> None: m.bias.data.zero_() -def _regnet(arch: str, params: RegNetParams, pretrained: bool, progress: bool, **kwargs: Any) -> RegNet: - model = RegNet(params) +def _regnet(arch: str, block_params: BlockParams, pretrained: bool, progress: bool, **kwargs: Any) -> RegNet: + model = RegNet(block_params, norm_layer=partial(nn.BatchNorm2d, eps=1e-05, momentum=0.1), **kwargs) if pretrained: if arch not in model_urls: raise ValueError(f"No checkpoint is available for model type {arch}") @@ -486,7 +491,7 @@ def regnet_y_400mf(pretrained: bool = False, progress: bool = True, **kwargs: An pretrained (bool): If True, returns a model pre-trained on ImageNet progress (bool): If 
True, displays a progress bar of the download to stderr """ - params = RegNetParams(depth=16, w_0=48, w_a=27.89, w_m=2.09, group_width=8, **kwargs) + params = BlockParams(depth=16, w_0=48, w_a=27.89, w_m=2.09, group_width=8, **kwargs) return _regnet("regnet_y_400mf", params, pretrained, progress, **kwargs) @@ -499,7 +504,7 @@ def regnet_y_800mf(pretrained: bool = False, progress: bool = True, **kwargs: An pretrained (bool): If True, returns a model pre-trained on ImageNet progress (bool): If True, displays a progress bar of the download to stderr """ - params = RegNetParams(depth=14, w_0=56, w_a=38.84, w_m=2.4, group_width=16, **kwargs) + params = BlockParams(depth=14, w_0=56, w_a=38.84, w_m=2.4, group_width=16, **kwargs) return _regnet("regnet_y_800mf", params, pretrained, progress, **kwargs) @@ -512,7 +517,7 @@ def regnet_y_1_6gf(pretrained: bool = False, progress: bool = True, **kwargs: An pretrained (bool): If True, returns a model pre-trained on ImageNet progress (bool): If True, displays a progress bar of the download to stderr """ - params = RegNetParams(depth=27, w_0=48, w_a=20.71, w_m=2.65, group_width=24, **kwargs) + params = BlockParams(depth=27, w_0=48, w_a=20.71, w_m=2.65, group_width=24, **kwargs) return _regnet("regnet_y_1_6gf", params, pretrained, progress, **kwargs) @@ -525,7 +530,7 @@ def regnet_y_3_2gf(pretrained: bool = False, progress: bool = True, **kwargs: An pretrained (bool): If True, returns a model pre-trained on ImageNet progress (bool): If True, displays a progress bar of the download to stderr """ - params = RegNetParams(depth=21, w_0=80, w_a=42.63, w_m=2.66, group_width=24, **kwargs) + params = BlockParams(depth=21, w_0=80, w_a=42.63, w_m=2.66, group_width=24, **kwargs) return _regnet("regnet_y_3_2gf", params, pretrained, progress, **kwargs) @@ -538,7 +543,7 @@ def regnet_y_8gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) pretrained (bool): If True, returns a model pre-trained on ImageNet progress (bool): If True, displays a progress bar of the download to stderr """ - params = RegNetParams(depth=17, w_0=192, w_a=76.82, w_m=2.19, group_width=56, **kwargs) + params = BlockParams(depth=17, w_0=192, w_a=76.82, w_m=2.19, group_width=56, **kwargs) return _regnet("regnet_y_8gf", params, pretrained, progress, **kwargs) @@ -551,7 +556,7 @@ def regnet_y_16gf(pretrained: bool = False, progress: bool = True, **kwargs: Any pretrained (bool): If True, returns a model pre-trained on ImageNet progress (bool): If True, displays a progress bar of the download to stderr """ - params = RegNetParams(depth=18, w_0=200, w_a=106.23, w_m=2.48, group_width=112, **kwargs) + params = BlockParams(depth=18, w_0=200, w_a=106.23, w_m=2.48, group_width=112, **kwargs) return _regnet("regnet_y_16gf", params, pretrained, progress, **kwargs) @@ -564,7 +569,7 @@ def regnet_y_32gf(pretrained: bool = False, progress: bool = True, **kwargs: Any pretrained (bool): If True, returns a model pre-trained on ImageNet progress (bool): If True, displays a progress bar of the download to stderr """ - params = RegNetParams(depth=20, w_0=232, w_a=115.89, w_m=2.53, group_width=232, **kwargs) + params = BlockParams(depth=20, w_0=232, w_a=115.89, w_m=2.53, group_width=232, **kwargs) return _regnet("regnet_y_32gf", params, pretrained, progress, **kwargs) @@ -577,8 +582,8 @@ def regnet_x_400mf(pretrained: bool = False, progress: bool = True, **kwargs: An pretrained (bool): If True, returns a model pre-trained on ImageNet progress (bool): If True, displays a progress bar of the download to stderr 
""" - params = RegNetParams(depth=22, w_0=24, w_a=24.48, w_m=2.54, - group_width=16, use_se=False, **kwargs) + params = BlockParams(depth=22, w_0=24, w_a=24.48, w_m=2.54, + group_width=16, use_se=False, **kwargs) return _regnet("regnet_x_400mf", params, pretrained, progress, **kwargs) @@ -591,8 +596,8 @@ def regnet_x_800mf(pretrained: bool = False, progress: bool = True, **kwargs: An pretrained (bool): If True, returns a model pre-trained on ImageNet progress (bool): If True, displays a progress bar of the download to stderr """ - params = RegNetParams(depth=16, w_0=56, w_a=35.73, w_m=2.28, group_width=16, - use_se=False, **kwargs) + params = BlockParams(depth=16, w_0=56, w_a=35.73, w_m=2.28, group_width=16, + use_se=False, **kwargs) return _regnet("regnet_x_800mf", params, pretrained, progress, **kwargs) @@ -605,8 +610,8 @@ def regnet_x_1_6gf(pretrained: bool = False, progress: bool = True, **kwargs: An pretrained (bool): If True, returns a model pre-trained on ImageNet progress (bool): If True, displays a progress bar of the download to stderr """ - params = RegNetParams(depth=18, w_0=80, w_a=34.01, w_m=2.25, group_width=24, - use_se=False, **kwargs) + params = BlockParams(depth=18, w_0=80, w_a=34.01, w_m=2.25, group_width=24, + use_se=False, **kwargs) return _regnet("regnet_x_1_6gf", params, pretrained, progress, **kwargs) @@ -619,8 +624,8 @@ def regnet_x_3_2gf(pretrained: bool = False, progress: bool = True, **kwargs: An pretrained (bool): If True, returns a model pre-trained on ImageNet progress (bool): If True, displays a progress bar of the download to stderr """ - params = RegNetParams(depth=25, w_0=88, w_a=26.31, w_m=2.25, group_width=48, - use_se=False, **kwargs) + params = BlockParams(depth=25, w_0=88, w_a=26.31, w_m=2.25, group_width=48, + use_se=False, **kwargs) return _regnet("regnet_x_3_2gf", params, pretrained, progress, **kwargs) @@ -633,8 +638,8 @@ def regnet_x_8gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) pretrained (bool): If True, returns a model pre-trained on ImageNet progress (bool): If True, displays a progress bar of the download to stderr """ - params = RegNetParams(depth=23, w_0=80, w_a=49.56, w_m=2.88, group_width=120, - use_se=False, **kwargs) + params = BlockParams(depth=23, w_0=80, w_a=49.56, w_m=2.88, group_width=120, + use_se=False, **kwargs) return _regnet("regnet_x_8gf", params, pretrained, progress, **kwargs) @@ -647,8 +652,8 @@ def regnet_x_16gf(pretrained: bool = False, progress: bool = True, **kwargs: Any pretrained (bool): If True, returns a model pre-trained on ImageNet progress (bool): If True, displays a progress bar of the download to stderr """ - params = RegNetParams(depth=22, w_0=216, w_a=55.59, w_m=2.1, group_width=128, - use_se=False, **kwargs) + params = BlockParams(depth=22, w_0=216, w_a=55.59, w_m=2.1, group_width=128, + use_se=False, **kwargs) return _regnet("regnet_x_16gf", params, pretrained, progress, **kwargs) @@ -661,8 +666,8 @@ def regnet_x_32gf(pretrained: bool = False, progress: bool = True, **kwargs: Any pretrained (bool): If True, returns a model pre-trained on ImageNet progress (bool): If True, displays a progress bar of the download to stderr """ - params = RegNetParams(depth=23, w_0=320, w_a=69.86, w_m=2.0, group_width=168, - use_se=False, **kwargs) + params = BlockParams(depth=23, w_0=320, w_a=69.86, w_m=2.0, group_width=168, + use_se=False, **kwargs) return _regnet("regnet_x_32gf", params, pretrained, progress, **kwargs) # TODO(kazhang): Add RegNetZ_500MF and RegNetZ_4GF From 
2417685c8e7154e2d27c9286a7bc6d400bf4e34c Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Mon, 20 Sep 2021 22:15:03 +0000 Subject: [PATCH 30/40] use nn.init, replace np with torch --- torchvision/models/regnet.py | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py index 7e94bf04978..0b49bc17fb8 100644 --- a/torchvision/models/regnet.py +++ b/torchvision/models/regnet.py @@ -3,7 +3,6 @@ # https://github.com/facebookresearch/ClassyVision/blob/main/classy_vision/models/regnet.py -import numpy as np import math import torch @@ -326,14 +325,14 @@ def get_expanded_params(self): STRIDE = 2 # Compute the block widths. Each stage has one unique block width - widths_cont = np.arange(self.depth) * self.w_a + self.w_0 - block_capacity = np.round(np.log(widths_cont / self.w_0) / np.log(self.w_m)) + widths_cont = torch.arange(self.depth) * self.w_a + self.w_0 + block_capacity = torch.round(torch.log(widths_cont / self.w_0) / math.log(self.w_m)) block_widths = ( - np.round(np.divide(self.w_0 * np.power(self.w_m, block_capacity), QUANT)) + torch.round(torch.divide(self.w_0 * torch.pow(self.w_m, block_capacity), QUANT)) * QUANT - ) - num_stages = len(np.unique(block_widths)) - block_widths = block_widths.astype(int).tolist() + ).int() + num_stages = len(torch.unique(block_widths)) + block_widths = block_widths.tolist() # Convert to per stage parameters split_helper = zip( @@ -345,7 +344,7 @@ def get_expanded_params(self): splits = [w != wp or r != rp for w, wp, r, rp in split_helper] stage_widths = [w for w, t in zip(block_widths, splits[:-1]) if t] - stage_depths = np.diff([d for d, t in enumerate(splits) if t]).tolist() + stage_depths = torch.diff(torch.Tensor([d for d, t in enumerate(splits) if t])).int().tolist() strides = [STRIDE] * num_stages bottleneck_multipliers = [self.bottleneck_multiplier] * num_stages @@ -445,7 +444,7 @@ def __init__( self.fc = nn.Linear(in_features=current_width, out_features=num_classes) # Init weights and good to go - self._init_weights() + self.reset_parameters() def forward(self, x: Tensor) -> Tensor: x = self.stem(x) @@ -457,19 +456,19 @@ def forward(self, x: Tensor) -> Tensor: return x - def _init_weights(self) -> None: + def reset_parameters(self) -> None: # Performs ResNet-style weight initialization for m in self.modules(): if isinstance(m, nn.Conv2d): # Note that there is no bias due to BN fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels - m.weight.data.normal_(mean=0.0, std=math.sqrt(2.0 / fan_out)) + nn.init.normal_(m.weight, mean=0.0, std=math.sqrt(2.0 / fan_out)) elif isinstance(m, nn.BatchNorm2d): - m.weight.data.fill_(1.0) - m.bias.data.zero_() + nn.init.ones_(m.weight) + nn.init.zeros_(m.bias) elif isinstance(m, nn.Linear): - m.weight.data.normal_(mean=0.0, std=0.01) - m.bias.data.zero_() + nn.init.normal_(m.weight, mean=0.0, std=0.01) + nn.init.zeros_(m.bias) def _regnet(arch: str, block_params: BlockParams, pretrained: bool, progress: bool, **kwargs: Any) -> RegNet: From f3b3e9667b073a0071be9f76a655ab49ba18cb9e Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Mon, 20 Sep 2021 22:40:50 +0000 Subject: [PATCH 31/40] update README --- references/classification/README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/references/classification/README.md b/references/classification/README.md index 79149758428..5d945d2728d 100644 --- a/references/classification/README.md +++ b/references/classification/README.md @@ 
-82,8 +82,7 @@ The weights of the B5-B7 variants are ported from Luke Melas' [EfficientNet-PyTo ### RegNet ``` -python -m torch.distributed.launch --nproc_per_node=8 --use_env train.py\ - --model regnet_y_400mf --epochs 100 +torchrun --nproc_per_node=8 train.py --model regnet_y_400mf --epochs 100 --batch-size 128 ``` ## Mixed precision training From e60e4daad310fb52bdae176327c9b9ebe4c2eb10 Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Tue, 21 Sep 2021 05:59:32 +0000 Subject: [PATCH 32/40] construct model with stem, block, classifier instances --- torchvision/models/regnet.py | 154 +++++++++++++++++++---------------- 1 file changed, 85 insertions(+), 69 deletions(-) diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py index 0b49bc17fb8..9e2a574356e 100644 --- a/torchvision/models/regnet.py +++ b/torchvision/models/regnet.py @@ -73,6 +73,20 @@ def __init__( norm_layer=norm_layer, activation_layer=activation_layer) +def _make_stem( + stem_width: int, + norm_layer: Callable[..., nn.Module], + activation: Callable[..., nn.Module], + stem_type: Callable[..., nn.Module] = SimpleStemIN, +) -> nn.Module: + return stem_type( + 3, # width_in + stem_width, + norm_layer, + activation, + ) + + class VanillaBlock(nn.Sequential): """Vanilla block: [3x3 conv, BN, Relu] x2.""" @@ -201,9 +215,6 @@ def __init__( ) self.activation = activation_layer(inplace=True) - # The projection and transform happen in parallel, - # and activation is not counted with respect to depth - def forward(self, x: Tensor) -> Tensor: if self.proj_block: x = self.bn(self.proj(x)) + self.f(x) @@ -288,6 +299,7 @@ def __init__( bottleneck_multiplier: float = 1.0, use_se: bool = True, se_ratio: float = 0.25, + **kwargs: Any, ) -> None: if w_a < 0 or w_0 <= 0 or w_m <= 1 or w_0 % 8 != 0: raise ValueError("Invalid RegNet settings") @@ -377,83 +389,79 @@ def _adjust_widths_groups_compatibilty( return stage_widths, group_widths_min -class RegNet(nn.Module): - def __init__( - self, - block_params: BlockParams, - num_classes: int = 1000, - stem_width: int = 32, - stem_type: Optional[Callable[..., nn.Module]] = None, - block_type: Optional[Callable[..., nn.Module]] = None, - norm_layer: Optional[Callable[..., nn.Module]] = None, - activation: Optional[Callable[..., nn.Module]] = None, - ) -> None: - super().__init__() - - if stem_type is None: - stem_type = SimpleStemIN - if norm_layer is None: - norm_layer = nn.BatchNorm2d - if block_type is None: - block_type = ResBottleneckBlock - if activation is None: - activation = nn.ReLU - - # Ad hoc stem - self.stem = stem_type( - 3, # width_in - stem_width, - norm_layer, - activation, +def _make_blocks( + stem_width: int, + params: BlockParams, + norm_layer: Callable[..., nn.Module], + activation: Callable[..., nn.Module], + block_type: Callable[..., nn.Module] = ResBottleneckBlock, +) -> Tuple[nn.Sequential, int]: + current_width = stem_width + + blocks = [] + for i, ( + width_out, + stride, + depth, + group_width, + bottleneck_multiplier, + ) in enumerate(params.get_expanded_params()): + blocks.append( + ( + f"block{i+1}", + AnyStage( + current_width, + width_out, + stride, + depth, + block_type, + norm_layer, + activation, + group_width, + bottleneck_multiplier, + params.se_ratio, + stage_index=i + 1, + ), + ) ) - current_width = stem_width + current_width = width_out + return (nn.Sequential(OrderedDict(blocks)), current_width) - blocks = [] - for i, ( - width_out, - stride, - depth, - group_width, - bottleneck_multiplier, - ) in enumerate(block_params.get_expanded_params()): - 
blocks.append( - ( - f"block{i+1}", - AnyStage( - current_width, - width_out, - stride, - depth, - block_type, - norm_layer, - activation, - group_width, - bottleneck_multiplier, - block_params.se_ratio, - stage_index=i + 1, - ), - ) - ) - current_width = width_out +class Classifier(nn.Module): + def __init__(self, in_channels: int, num_classes: int = 1000) -> None: + super().__init__() + self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) + self.fc = nn.Linear(in_features=in_channels, out_features=num_classes) - self.trunk_output = nn.Sequential(OrderedDict(blocks)) + def forward(self, x: Tensor) -> Tensor: + x = self.avgpool(x) + x = x.flatten(start_dim=1) + x = self.fc(x) + return x - self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) - self.fc = nn.Linear(in_features=current_width, out_features=num_classes) + +class RegNet(nn.Module): + def __init__( + self, + stem: nn.Module, + blocks: nn.Module, + classifier: nn.Module, + **kwargs: Any, + ) -> None: + super().__init__() + self.stem = stem + self.blocks = blocks + self.classifier = classifier # Init weights and good to go self.reset_parameters() def forward(self, x: Tensor) -> Tensor: x = self.stem(x) - x = self.trunk_output(x) - - x = self.avgpool(x) - x = x.flatten(start_dim=1) - x = self.fc(x) - + x = self.blocks(x) + x = self.classifier(x) return x def reset_parameters(self) -> None: @@ -472,7 +480,15 @@ def reset_parameters(self) -> None: def _regnet(arch: str, block_params: BlockParams, pretrained: bool, progress: bool, **kwargs: Any) -> RegNet: - model = RegNet(block_params, norm_layer=partial(nn.BatchNorm2d, eps=1e-05, momentum=0.1), **kwargs) + norm_layer = kwargs["norm_layer"] if "norm_layer" in kwargs else partial(nn.BatchNorm2d, eps=1e-05, momentum=0.1) + activation = kwargs["activation"] if "activation" in kwargs else nn.ReLU + num_classes = kwargs["num_classes"] if "num_classes" in kwargs else 1000 + + stem_width = 32 + stem = _make_stem(stem_width, norm_layer=norm_layer, activation=activation) + blocks, out_channels = _make_blocks(stem_width, params=block_params, norm_layer=norm_layer, activation=activation) + classifier = Classifier(out_channels, num_classes) + model = RegNet(stem, blocks, classifier) if pretrained: if arch not in model_urls: raise ValueError(f"No checkpoint is available for model type {arch}") From 27da2c735b6338f468204ba76d07daace1fbeba5 Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Wed, 22 Sep 2021 18:51:58 +0000 Subject: [PATCH 33/40] Revert "construct model with stem, block, classifier instances" This reverts commit 850f5f3ed01a2a9b36fcbf8405afd6e41d2e58ef. 
--- torchvision/models/regnet.py | 154 ++++++++++++++++------------------- 1 file changed, 69 insertions(+), 85 deletions(-) diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py index 9e2a574356e..0b49bc17fb8 100644 --- a/torchvision/models/regnet.py +++ b/torchvision/models/regnet.py @@ -73,20 +73,6 @@ def __init__( norm_layer=norm_layer, activation_layer=activation_layer) -def _make_stem( - stem_width: int, - norm_layer: Callable[..., nn.Module], - activation: Callable[..., nn.Module], - stem_type: Callable[..., nn.Module] = SimpleStemIN, -) -> nn.Module: - return stem_type( - 3, # width_in - stem_width, - norm_layer, - activation, - ) - - class VanillaBlock(nn.Sequential): """Vanilla block: [3x3 conv, BN, Relu] x2.""" @@ -215,6 +201,9 @@ def __init__( ) self.activation = activation_layer(inplace=True) + # The projection and transform happen in parallel, + # and activation is not counted with respect to depth + def forward(self, x: Tensor) -> Tensor: if self.proj_block: x = self.bn(self.proj(x)) + self.f(x) @@ -299,7 +288,6 @@ def __init__( bottleneck_multiplier: float = 1.0, use_se: bool = True, se_ratio: float = 0.25, - **kwargs: Any, ) -> None: if w_a < 0 or w_0 <= 0 or w_m <= 1 or w_0 % 8 != 0: raise ValueError("Invalid RegNet settings") @@ -389,79 +377,83 @@ def _adjust_widths_groups_compatibilty( return stage_widths, group_widths_min -def _make_blocks( - stem_width: int, - params: BlockParams, - norm_layer: Callable[..., nn.Module], - activation: Callable[..., nn.Module], - block_type: Callable[..., nn.Module] = ResBottleneckBlock, -) -> Tuple[nn.Sequential, int]: - current_width = stem_width - - blocks = [] - for i, ( - width_out, - stride, - depth, - group_width, - bottleneck_multiplier, - ) in enumerate(params.get_expanded_params()): - blocks.append( - ( - f"block{i+1}", - AnyStage( - current_width, - width_out, - stride, - depth, - block_type, - norm_layer, - activation, - group_width, - bottleneck_multiplier, - params.se_ratio, - stage_index=i + 1, - ), - ) - ) +class RegNet(nn.Module): + def __init__( + self, + block_params: BlockParams, + num_classes: int = 1000, + stem_width: int = 32, + stem_type: Optional[Callable[..., nn.Module]] = None, + block_type: Optional[Callable[..., nn.Module]] = None, + norm_layer: Optional[Callable[..., nn.Module]] = None, + activation: Optional[Callable[..., nn.Module]] = None, + ) -> None: + super().__init__() - current_width = width_out - return (nn.Sequential(OrderedDict(blocks)), current_width) + if stem_type is None: + stem_type = SimpleStemIN + if norm_layer is None: + norm_layer = nn.BatchNorm2d + if block_type is None: + block_type = ResBottleneckBlock + if activation is None: + activation = nn.ReLU + + # Ad hoc stem + self.stem = stem_type( + 3, # width_in + stem_width, + norm_layer, + activation, + ) + current_width = stem_width -class Classifier(nn.Module): - def __init__(self, in_channels: int, num_classes: int = 1000) -> None: - super().__init__() - self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) - self.fc = nn.Linear(in_features=in_channels, out_features=num_classes) + blocks = [] + for i, ( + width_out, + stride, + depth, + group_width, + bottleneck_multiplier, + ) in enumerate(block_params.get_expanded_params()): + blocks.append( + ( + f"block{i+1}", + AnyStage( + current_width, + width_out, + stride, + depth, + block_type, + norm_layer, + activation, + group_width, + bottleneck_multiplier, + block_params.se_ratio, + stage_index=i + 1, + ), + ) + ) - def forward(self, x: Tensor) -> Tensor: - x = self.avgpool(x) 
- x = x.flatten(start_dim=1) - x = self.fc(x) - return x + current_width = width_out + self.trunk_output = nn.Sequential(OrderedDict(blocks)) -class RegNet(nn.Module): - def __init__( - self, - stem: nn.Module, - blocks: nn.Module, - classifier: nn.Module, - **kwargs: Any, - ) -> None: - super().__init__() - self.stem = stem - self.blocks = blocks - self.classifier = classifier + self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) + self.fc = nn.Linear(in_features=current_width, out_features=num_classes) # Init weights and good to go self.reset_parameters() def forward(self, x: Tensor) -> Tensor: x = self.stem(x) - x = self.blocks(x) - x = self.classifier(x) + x = self.trunk_output(x) + + x = self.avgpool(x) + x = x.flatten(start_dim=1) + x = self.fc(x) + return x def reset_parameters(self) -> None: @@ -480,15 +472,7 @@ def reset_parameters(self) -> None: def _regnet(arch: str, block_params: BlockParams, pretrained: bool, progress: bool, **kwargs: Any) -> RegNet: - norm_layer = kwargs["norm_layer"] if "norm_layer" in kwargs else partial(nn.BatchNorm2d, eps=1e-05, momentum=0.1) - activation = kwargs["activation"] if "activation" in kwargs else nn.ReLU - num_classes = kwargs["num_classes"] if "num_classes" in kwargs else 1000 - - stem_width = 32 - stem = _make_stem(stem_width, norm_layer=norm_layer, activation=activation) - blocks, out_channels = _make_blocks(stem_width, params=block_params, norm_layer=norm_layer, activation=activation) - classifier = Classifier(out_channels, num_classes) - model = RegNet(stem, blocks, classifier) + model = RegNet(block_params, norm_layer=partial(nn.BatchNorm2d, eps=1e-05, momentum=0.1), **kwargs) if pretrained: if arch not in model_urls: raise ValueError(f"No checkpoint is available for model type {arch}") From ddf53837825fe02f72610747c0086b74760da4d0 Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Wed, 22 Sep 2021 21:34:07 +0000 Subject: [PATCH 34/40] remove unused blocks --- torchvision/models/efficientnet.py | 2 + torchvision/models/regnet.py | 128 +---------------------------- 2 files changed, 3 insertions(+), 127 deletions(-) diff --git a/torchvision/models/efficientnet.py b/torchvision/models/efficientnet.py index dbfb6bb7dd7..bad5b57b25b 100644 --- a/torchvision/models/efficientnet.py +++ b/torchvision/models/efficientnet.py @@ -37,11 +37,13 @@ def __init__( input_channels: int, squeeze_channels: int, activation: Callable[..., nn.Module] = nn.ReLU, + scale_activation: Callable[..., nn.Module] = nn.Sigmoid, ) -> None: super().__init__() self.fc1 = nn.Conv2d(input_channels, squeeze_channels, 1) self.fc2 = nn.Conv2d(squeeze_channels, input_channels, 1) self.activation = activation() + self.scale_activation = scale_activation() def _scale(self, input: Tensor) -> Tensor: scale = F.adaptive_avg_pool2d(input, 1) diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py index 0b49bc17fb8..eb2ef1bdabe 100644 --- a/torchvision/models/regnet.py +++ b/torchvision/models/regnet.py @@ -21,44 +21,6 @@ } -class BasicTransform(nn.Sequential): - """Basic transformation: [3x3 conv, BN, Relu] x2.""" - - def __init__( - self, - width_in: int, - width_out: int, - stride: int, - norm_layer: Callable[..., nn.Module], - activation_layer: Callable[..., nn.Module], - ) -> None: - super().__init__(OrderedDict( - a=nn.Sequential( - ConvBNActivation(width_in, width_out, kernel_size=3, stride=stride, - norm_layer=norm_layer, activation_layer=activation_layer), - nn.Conv2d(width_out, width_out, 3, stride=1, padding=1, bias=False), - ), - final_bn=norm_layer(width_out), - )) - 
- -class ResStemIN(nn.Sequential): - """ResNet stem for ImageNet: 7x7, BN, ReLU, MaxPool.""" - - def __init__( - self, - width_in: int, - width_out: int, - norm_layer: Callable[..., nn.Module], - activation_layer: Callable[..., nn.Module], - ) -> None: - super().__init__( - ConvBNActivation(width_in, width_out, kernel_size=7, stride=2, - norm_layer=norm_layer, activation_layer=activation_layer), - nn.MaxPool2d(3, stride=2, padding=1), - ) - - class SimpleStemIN(ConvBNActivation): """Simple stem for ImageNet: 3x3, BN, ReLU.""" @@ -73,61 +35,6 @@ def __init__( norm_layer=norm_layer, activation_layer=activation_layer) -class VanillaBlock(nn.Sequential): - """Vanilla block: [3x3 conv, BN, Relu] x2.""" - - def __init__( - self, - width_in: int, - width_out: int, - stride: int, - norm_layer: Callable[..., nn.Module], - activation_layer: Callable[..., nn.Module], - *args, - **kwargs, - ) -> None: - super().__init__(OrderedDict( - a=ConvBNActivation(width_in, width_out, kernel_size=3, stride=stride, - norm_layer=norm_layer, activation_layer=activation_layer), - b=ConvBNActivation(width_out, width_out, kernel_size=3, stride=1, - norm_layer=norm_layer, activation_layer=activation_layer), - )) - - -class ResBasicBlock(nn.Module): - """Residual basic block: x + F(x), F = basic transform.""" - - def __init__( - self, - width_in: int, - width_out: int, - stride: int, - norm_layer: Callable[..., nn.Module], - activation_layer: Callable[..., nn.Module], - *args, - **kwargs, - ) -> None: - super().__init__() - self.proj_block = (width_in != width_out) or (stride != 1) - if self.proj_block: - self.proj = nn.Conv2d( - width_in, width_out, 1, stride=stride, padding=0, bias=False - ) - self.bn = norm_layer(width_out) - self.f = BasicTransform( - width_in, width_out, stride, norm_layer, activation_layer - ) - self.activation = activation_layer(inplace=True) - - def forward(self, x: Tensor) -> Tensor: - if self.proj_block: - x = self.bn(self.proj(x)) + self.f(x) - else: - x = x + self.f(x) - - return self.activation(x) - - class BottleneckTransform(nn.Sequential): """Bottleneck transformation: 1x1, 3x3 [+SE], 1x1.""" @@ -201,9 +108,6 @@ def __init__( ) self.activation = activation_layer(inplace=True) - # The projection and transform happen in parallel, - # and activation is not counted with respect to depth - def forward(self, x: Tensor) -> Tensor: if self.proj_block: x = self.bn(self.proj(x)) + self.f(x) @@ -212,37 +116,6 @@ def forward(self, x: Tensor) -> Tensor: return self.activation(x) -class ResBottleneckLinearBlock(nn.Module): - """Residual linear bottleneck block: x + F(x), F = bottleneck transform.""" - - def __init__( - self, - width_in: int, - width_out: int, - stride: int, - norm_layer: Callable[..., nn.Module], - activation_layer: Callable[..., nn.Module], - group_width: int = 1, - bottleneck_multiplier: float = 4.0, - se_ratio: Optional[float] = None, - ) -> None: - super().__init__() - self.has_skip = (width_in == width_out) and (stride == 1) - self.f = BottleneckTransform( - width_in, - width_out, - stride, - norm_layer, - activation_layer, - group_width, - bottleneck_multiplier, - se_ratio, - ) - - def forward(self, x: Tensor) -> Tensor: - return x + self.f(x) if self.has_skip else self.f(x) - - class AnyStage(nn.Sequential): """AnyNet stage (sequence of blocks w/ the same output shape).""" @@ -288,6 +161,7 @@ def __init__( bottleneck_multiplier: float = 1.0, use_se: bool = True, se_ratio: float = 0.25, + **kwargs: Any, ) -> None: if w_a < 0 or w_0 <= 0 or w_m <= 1 or w_0 % 8 != 0: raise 
ValueError("Invalid RegNet settings") From 293073d6d6d40c7403ee2be719159a75f80a272c Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Wed, 22 Sep 2021 22:19:49 +0000 Subject: [PATCH 35/40] support scaled model --- torchvision/models/regnet.py | 111 +++++++++++++++++++++-------------- 1 file changed, 67 insertions(+), 44 deletions(-) diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py index eb2ef1bdabe..7b9864b96f9 100644 --- a/torchvision/models/regnet.py +++ b/torchvision/models/regnet.py @@ -153,28 +153,32 @@ def __init__( class BlockParams: def __init__( self, + depths: List[int], + widths: List[int], + group_widths: List[int], + bottleneck_multipliers: List[int], + strides: List[int], + se_ratio: Optional[float] = None, + ) -> None: + self.depths = depths + self.widths = widths + self.group_widths = group_widths + self.bottleneck_multipliers = bottleneck_multipliers + self.strides = strides + self.se_ratio = se_ratio + + @classmethod + def from_init_params( + cls, depth: int, w_0: int, w_a: float, w_m: float, group_width: int, bottleneck_multiplier: float = 1.0, - use_se: bool = True, - se_ratio: float = 0.25, + se_ratio: Optional[float] = None, **kwargs: Any, - ) -> None: - if w_a < 0 or w_0 <= 0 or w_m <= 1 or w_0 % 8 != 0: - raise ValueError("Invalid RegNet settings") - self.depth = depth - self.w_0 = w_0 - self.w_a = w_a - self.w_m = w_m - self.group_width = group_width - self.bottleneck_multiplier = bottleneck_multiplier - self.use_se = use_se - self.se_ratio = se_ratio if use_se else None - - def get_expanded_params(self): + ) -> "BlockParams": """ Programatically compute all the per-block settings, given the RegNet parameters. @@ -198,11 +202,13 @@ def get_expanded_params(self): QUANT = 8 STRIDE = 2 + if w_a < 0 or w_0 <= 0 or w_m <= 1 or w_0 % 8 != 0: + raise ValueError("Invalid RegNet settings") # Compute the block widths. 
Each stage has one unique block width - widths_cont = torch.arange(self.depth) * self.w_a + self.w_0 - block_capacity = torch.round(torch.log(widths_cont / self.w_0) / math.log(self.w_m)) + widths_cont = torch.arange(depth) * w_a + w_0 + block_capacity = torch.round(torch.log(widths_cont / w_0) / math.log(w_m)) block_widths = ( - torch.round(torch.divide(self.w_0 * torch.pow(self.w_m, block_capacity), QUANT)) + torch.round(torch.divide(w_0 * torch.pow(w_m, block_capacity), QUANT)) * QUANT ).int() num_stages = len(torch.unique(block_widths)) @@ -221,16 +227,26 @@ def get_expanded_params(self): stage_depths = torch.diff(torch.Tensor([d for d, t in enumerate(splits) if t])).int().tolist() strides = [STRIDE] * num_stages - bottleneck_multipliers = [self.bottleneck_multiplier] * num_stages - group_widths = [self.group_width] * num_stages + bottleneck_multipliers = [bottleneck_multiplier] * num_stages + group_widths = [group_width] * num_stages # Adjust the compatibility of stage widths and group widths - stage_widths, group_widths = self._adjust_widths_groups_compatibilty( + stage_widths, group_widths = cls._adjust_widths_groups_compatibilty( stage_widths, bottleneck_multipliers, group_widths ) + return cls( + depths=stage_depths, + widths=stage_widths, + group_widths=group_widths, + bottleneck_multipliers=bottleneck_multipliers, + strides=strides, + se_ratio=se_ratio, + ) + + def _get_expanded_params(self): return zip( - stage_widths, strides, stage_depths, group_widths, bottleneck_multipliers + self.widths, self.strides, self.depths, self.group_widths, self.bottleneck_multipliers ) @staticmethod @@ -290,7 +306,7 @@ def __init__( depth, group_width, bottleneck_multiplier, - ) in enumerate(block_params.get_expanded_params()): + ) in enumerate(block_params._get_expanded_params()): blocks.append( ( f"block{i+1}", @@ -364,7 +380,8 @@ def regnet_y_400mf(pretrained: bool = False, progress: bool = True, **kwargs: An pretrained (bool): If True, returns a model pre-trained on ImageNet progress (bool): If True, displays a progress bar of the download to stderr """ - params = BlockParams(depth=16, w_0=48, w_a=27.89, w_m=2.09, group_width=8, **kwargs) + params = BlockParams.from_init_params(depth=16, w_0=48, w_a=27.89, w_m=2.09, + group_width=8, se_ratio=0.25, **kwargs) return _regnet("regnet_y_400mf", params, pretrained, progress, **kwargs) @@ -377,7 +394,8 @@ def regnet_y_800mf(pretrained: bool = False, progress: bool = True, **kwargs: An pretrained (bool): If True, returns a model pre-trained on ImageNet progress (bool): If True, displays a progress bar of the download to stderr """ - params = BlockParams(depth=14, w_0=56, w_a=38.84, w_m=2.4, group_width=16, **kwargs) + params = BlockParams.from_init_params(depth=14, w_0=56, w_a=38.84, w_m=2.4, + group_width=16, se_ratio=0.25, **kwargs) return _regnet("regnet_y_800mf", params, pretrained, progress, **kwargs) @@ -390,7 +408,8 @@ def regnet_y_1_6gf(pretrained: bool = False, progress: bool = True, **kwargs: An pretrained (bool): If True, returns a model pre-trained on ImageNet progress (bool): If True, displays a progress bar of the download to stderr """ - params = BlockParams(depth=27, w_0=48, w_a=20.71, w_m=2.65, group_width=24, **kwargs) + params = BlockParams.from_init_params(depth=27, w_0=48, w_a=20.71, w_m=2.65, + group_width=24, se_ratio=0.25, **kwargs) return _regnet("regnet_y_1_6gf", params, pretrained, progress, **kwargs) @@ -403,7 +422,8 @@ def regnet_y_3_2gf(pretrained: bool = False, progress: bool = True, **kwargs: An pretrained (bool): If 
True, returns a model pre-trained on ImageNet progress (bool): If True, displays a progress bar of the download to stderr """ - params = BlockParams(depth=21, w_0=80, w_a=42.63, w_m=2.66, group_width=24, **kwargs) + params = BlockParams.from_init_params(depth=21, w_0=80, w_a=42.63, w_m=2.66, + group_width=24, se_ratio=0.25, **kwargs) return _regnet("regnet_y_3_2gf", params, pretrained, progress, **kwargs) @@ -416,7 +436,8 @@ def regnet_y_8gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) pretrained (bool): If True, returns a model pre-trained on ImageNet progress (bool): If True, displays a progress bar of the download to stderr """ - params = BlockParams(depth=17, w_0=192, w_a=76.82, w_m=2.19, group_width=56, **kwargs) + params = BlockParams.from_init_params(depth=17, w_0=192, w_a=76.82, w_m=2.19, + group_width=56, se_ratio=0.25, **kwargs) return _regnet("regnet_y_8gf", params, pretrained, progress, **kwargs) @@ -429,7 +450,8 @@ def regnet_y_16gf(pretrained: bool = False, progress: bool = True, **kwargs: Any pretrained (bool): If True, returns a model pre-trained on ImageNet progress (bool): If True, displays a progress bar of the download to stderr """ - params = BlockParams(depth=18, w_0=200, w_a=106.23, w_m=2.48, group_width=112, **kwargs) + params = BlockParams.from_init_params(depth=18, w_0=200, w_a=106.23, w_m=2.48, + group_width=112, se_ratio=0.25, **kwargs) return _regnet("regnet_y_16gf", params, pretrained, progress, **kwargs) @@ -442,7 +464,8 @@ def regnet_y_32gf(pretrained: bool = False, progress: bool = True, **kwargs: Any pretrained (bool): If True, returns a model pre-trained on ImageNet progress (bool): If True, displays a progress bar of the download to stderr """ - params = BlockParams(depth=20, w_0=232, w_a=115.89, w_m=2.53, group_width=232, **kwargs) + params = BlockParams.from_init_params(depth=20, w_0=232, w_a=115.89, w_m=2.53, + group_width=232, se_ratio=0.25, **kwargs) return _regnet("regnet_y_32gf", params, pretrained, progress, **kwargs) @@ -455,8 +478,8 @@ def regnet_x_400mf(pretrained: bool = False, progress: bool = True, **kwargs: An pretrained (bool): If True, returns a model pre-trained on ImageNet progress (bool): If True, displays a progress bar of the download to stderr """ - params = BlockParams(depth=22, w_0=24, w_a=24.48, w_m=2.54, - group_width=16, use_se=False, **kwargs) + params = BlockParams.from_init_params(depth=22, w_0=24, w_a=24.48, w_m=2.54, + group_width=16, **kwargs) return _regnet("regnet_x_400mf", params, pretrained, progress, **kwargs) @@ -469,8 +492,8 @@ def regnet_x_800mf(pretrained: bool = False, progress: bool = True, **kwargs: An pretrained (bool): If True, returns a model pre-trained on ImageNet progress (bool): If True, displays a progress bar of the download to stderr """ - params = BlockParams(depth=16, w_0=56, w_a=35.73, w_m=2.28, group_width=16, - use_se=False, **kwargs) + params = BlockParams.from_init_params(depth=16, w_0=56, w_a=35.73, w_m=2.28, + group_width=16, **kwargs) return _regnet("regnet_x_800mf", params, pretrained, progress, **kwargs) @@ -483,8 +506,8 @@ def regnet_x_1_6gf(pretrained: bool = False, progress: bool = True, **kwargs: An pretrained (bool): If True, returns a model pre-trained on ImageNet progress (bool): If True, displays a progress bar of the download to stderr """ - params = BlockParams(depth=18, w_0=80, w_a=34.01, w_m=2.25, group_width=24, - use_se=False, **kwargs) + params = BlockParams.from_init_params(depth=18, w_0=80, w_a=34.01, w_m=2.25, + group_width=24, **kwargs) return 
_regnet("regnet_x_1_6gf", params, pretrained, progress, **kwargs) @@ -497,8 +520,8 @@ def regnet_x_3_2gf(pretrained: bool = False, progress: bool = True, **kwargs: An pretrained (bool): If True, returns a model pre-trained on ImageNet progress (bool): If True, displays a progress bar of the download to stderr """ - params = BlockParams(depth=25, w_0=88, w_a=26.31, w_m=2.25, group_width=48, - use_se=False, **kwargs) + params = BlockParams.from_init_params(depth=25, w_0=88, w_a=26.31, w_m=2.25, + group_width=48, **kwargs) return _regnet("regnet_x_3_2gf", params, pretrained, progress, **kwargs) @@ -511,8 +534,8 @@ def regnet_x_8gf(pretrained: bool = False, progress: bool = True, **kwargs: Any) pretrained (bool): If True, returns a model pre-trained on ImageNet progress (bool): If True, displays a progress bar of the download to stderr """ - params = BlockParams(depth=23, w_0=80, w_a=49.56, w_m=2.88, group_width=120, - use_se=False, **kwargs) + params = BlockParams.from_init_params(depth=23, w_0=80, w_a=49.56, w_m=2.88, + group_width=120, **kwargs) return _regnet("regnet_x_8gf", params, pretrained, progress, **kwargs) @@ -525,8 +548,8 @@ def regnet_x_16gf(pretrained: bool = False, progress: bool = True, **kwargs: Any pretrained (bool): If True, returns a model pre-trained on ImageNet progress (bool): If True, displays a progress bar of the download to stderr """ - params = BlockParams(depth=22, w_0=216, w_a=55.59, w_m=2.1, group_width=128, - use_se=False, **kwargs) + params = BlockParams.from_init_params(depth=22, w_0=216, w_a=55.59, w_m=2.1, + group_width=128, **kwargs) return _regnet("regnet_x_16gf", params, pretrained, progress, **kwargs) @@ -539,8 +562,8 @@ def regnet_x_32gf(pretrained: bool = False, progress: bool = True, **kwargs: Any pretrained (bool): If True, returns a model pre-trained on ImageNet progress (bool): If True, displays a progress bar of the download to stderr """ - params = BlockParams(depth=23, w_0=320, w_a=69.86, w_m=2.0, group_width=168, - use_se=False, **kwargs) + params = BlockParams.from_init_params(depth=23, w_0=320, w_a=69.86, w_m=2.0, + group_width=168, **kwargs) return _regnet("regnet_x_32gf", params, pretrained, progress, **kwargs) # TODO(kazhang): Add RegNetZ_500MF and RegNetZ_4GF From 3957d5dd5797849ce8a40d7daa2aa1db55795664 Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Wed, 22 Sep 2021 22:37:08 +0000 Subject: [PATCH 36/40] fuse into ConvBNActivation --- torchvision/models/regnet.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py index 7b9864b96f9..9a5674f6abe 100644 --- a/torchvision/models/regnet.py +++ b/torchvision/models/regnet.py @@ -68,8 +68,8 @@ def __init__( activation=activation_layer, ) - layers["c"] = nn.Conv2d(w_b, width_out, 1, stride=1, padding=0, bias=False) - layers["final_bn"] = norm_layer(width_out) + layers["c"] = ConvBNActivation(w_b, width_out, kernel_size=1, stride=1, + norm_layer=norm_layer, activation_layer=nn.Identity) super().__init__(layers) @@ -92,10 +92,8 @@ def __init__( # Use skip connection with projection if shape changes self.proj_block = (width_in != width_out) or (stride != 1) if self.proj_block: - self.proj = nn.Conv2d( - width_in, width_out, 1, stride=stride, padding=0, bias=False - ) - self.bn = norm_layer(width_out) + self.proj = ConvBNActivation(width_in, width_out, kernel_size=1, + stride=stride, norm_layer=norm_layer, activation_layer=nn.Identity) self.f = BottleneckTransform( width_in, width_out, @@ -110,7 +108,7 @@ def 
__init__( def forward(self, x: Tensor) -> Tensor: if self.proj_block: - x = self.bn(self.proj(x)) + self.f(x) + x = self.proj(x) + self.f(x) else: x = x + self.f(x) return self.activation(x) From 208f045fa6fb6301dc76ec6c2d0399c7782c98c2 Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Wed, 22 Sep 2021 22:44:24 +0000 Subject: [PATCH 37/40] make reset_parameters private --- torchvision/models/regnet.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py index 9a5674f6abe..d18057641a1 100644 --- a/torchvision/models/regnet.py +++ b/torchvision/models/regnet.py @@ -332,7 +332,7 @@ def __init__( self.fc = nn.Linear(in_features=current_width, out_features=num_classes) # Init weights and good to go - self.reset_parameters() + self._reset_parameters() def forward(self, x: Tensor) -> Tensor: x = self.stem(x) @@ -344,7 +344,7 @@ def forward(self, x: Tensor) -> Tensor: return x - def reset_parameters(self) -> None: + def _reset_parameters(self) -> None: # Performs ResNet-style weight initialization for m in self.modules(): if isinstance(m, nn.Conv2d): From f78a27fb2b1a70fdbf3e9a74171380d945346bc5 Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Wed, 22 Sep 2021 23:56:16 +0000 Subject: [PATCH 38/40] fix type errors --- torchvision/models/regnet.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py index d18057641a1..f4fd2702f93 100644 --- a/torchvision/models/regnet.py +++ b/torchvision/models/regnet.py @@ -49,7 +49,7 @@ def __init__( bottleneck_multiplier: float, se_ratio: Optional[float], ) -> None: - layers = OrderedDict() + layers: OrderedDict[str, nn.Module] = OrderedDict() w_b = int(round(width_out * bottleneck_multiplier)) g = w_b // group_width @@ -154,7 +154,7 @@ def __init__( depths: List[int], widths: List[int], group_widths: List[int], - bottleneck_multipliers: List[int], + bottleneck_multipliers: List[float], strides: List[int], se_ratio: Optional[float] = None, ) -> None: @@ -208,9 +208,8 @@ def from_init_params( block_widths = ( torch.round(torch.divide(w_0 * torch.pow(w_m, block_capacity), QUANT)) * QUANT - ).int() - num_stages = len(torch.unique(block_widths)) - block_widths = block_widths.tolist() + ).int().tolist() + num_stages = len(set(block_widths)) # Convert to per stage parameters split_helper = zip( @@ -222,7 +221,7 @@ def from_init_params( splits = [w != wp or r != rp for w, wp, r, rp in split_helper] stage_widths = [w for w, t in zip(block_widths, splits[:-1]) if t] - stage_depths = torch.diff(torch.Tensor([d for d, t in enumerate(splits) if t])).int().tolist() + stage_depths = torch.diff(torch.tensor([d for d, t in enumerate(splits) if t])).int().tolist() strides = [STRIDE] * num_stages bottleneck_multipliers = [bottleneck_multiplier] * num_stages From f59ea8c9b262290c6847a2da33d23b7c2e40b9c0 Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Thu, 23 Sep 2021 05:13:47 +0000 Subject: [PATCH 39/40] fix for unit test --- torchvision/models/regnet.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py index f4fd2702f93..f18055f80b4 100644 --- a/torchvision/models/regnet.py +++ b/torchvision/models/regnet.py @@ -15,6 +15,13 @@ from torchvision.models.mobilenetv2 import ConvBNActivation, _make_divisible from torchvision.models.efficientnet import SqueezeExcitation + +__all__ = ["RegNet", "regnet_y_400mf", "regnet_y_800mf", 
"regnet_y_1_6gf", + "regnet_y_3_2gf", "regnet_y_8gf", "regnet_y_16gf", "regnet_y_32gf", + "regnet_x_400mf", "regnet_x_800mf", "regnet_x_1_6gf", "regnet_x_3_2gf", + "regnet_x_8gf", "regnet_x_16gf", "regnet_x_32gf"] + + model_urls = { # TODO(kazhang): add pretrained weights "regnet_y_400m": "", @@ -90,8 +97,9 @@ def __init__( super().__init__() # Use skip connection with projection if shape changes - self.proj_block = (width_in != width_out) or (stride != 1) - if self.proj_block: + self.proj = None + should_proj = (width_in != width_out) or (stride != 1) + if should_proj: self.proj = ConvBNActivation(width_in, width_out, kernel_size=1, stride=stride, norm_layer=norm_layer, activation_layer=nn.Identity) self.f = BottleneckTransform( @@ -107,7 +115,7 @@ def __init__( self.activation = activation_layer(inplace=True) def forward(self, x: Tensor) -> Tensor: - if self.proj_block: + if self.proj is not None: x = self.proj(x) + self.f(x) else: x = x + self.f(x) From b0325b6db1cf69011def768aa3f86229c7dddf31 Mon Sep 17 00:00:00 2001 From: Kai Zhang Date: Wed, 29 Sep 2021 00:06:05 +0000 Subject: [PATCH 40/40] add pretrained weights for 6 variant models, update docs --- docs/source/models.rst | 16 +++++++--------- references/classification/README.md | 26 +++++++++++++++++++++++++- torchvision/models/regnet.py | 8 ++++++-- 3 files changed, 38 insertions(+), 12 deletions(-) diff --git a/docs/source/models.rst b/docs/source/models.rst index be2a007d9ae..ef9c326ade4 100644 --- a/docs/source/models.rst +++ b/docs/source/models.rst @@ -111,18 +111,10 @@ These can be constructed by passing ``pretrained=True``: efficientnet_b7 = models.efficientnet_b7(pretrained=True) regnet_y_400mf = models.regnet_y_400mf(pretrained=True) regnet_y_800mf = models.regnet_y_800mf(pretrained=True) - regnet_y_1_6gf = models.regnet_y_1_6gf(pretrained=True) - regnet_y_3_2gf = models.regnet_y_3_2gf(pretrained=True) regnet_y_8gf = models.regnet_y_8gf(pretrained=True) - regnet_y_16gf = models.regnet_y_16gf(pretrained=True) - regnet_y_32gf = models.regnet_y_32gf(pretrained=True) regnet_x_400mf = models.regnet_x_400mf(pretrained=True) regnet_x_800mf = models.regnet_x_800mf(pretrained=True) - regnet_x_1_6gf = models.regnet_x_1_6gf(pretrained=True) - regnet_x_3_2gf = models.regnet_x_3_2gf(pretrained=True) regnet_x_8gf = models.regnet_x_8gf(pretrained=True) - regnet_x_16gf = models.regnet_x_16gf(pretrained=True) - regnet_x_32gf = models.regnet_x_32gf(pretrained=True) Instancing a pre-trained model will download its weights to a cache directory. This directory can be set using the `TORCH_MODEL_ZOO` environment variable. See @@ -217,6 +209,12 @@ EfficientNet-B4 83.384 96.594 EfficientNet-B5 83.444 96.628 EfficientNet-B6 84.008 96.916 EfficientNet-B7 84.122 96.908 +regnet_x_400mf 72.834 90.950 +regnet_x_800mf 75.190 92.418 +regnet_x_8gf 79.324 94.694 +regnet_y_400mf 74.024 91.680 +regnet_y_800mf 76.420 93.136 +regnet_y_8gf 79.966 95.100 ================================ ============= ============= @@ -347,7 +345,7 @@ EfficientNet .. autofunction:: efficientnet_b6 .. autofunction:: efficientnet_b7 -EfficientNet +RegNet ------------ .. 
autofunction:: regnet_y_400mf diff --git a/references/classification/README.md b/references/classification/README.md index 5d945d2728d..cc328f0f259 100644 --- a/references/classification/README.md +++ b/references/classification/README.md @@ -81,9 +81,33 @@ The weights of the B5-B7 variants are ported from Luke Melas' [EfficientNet-PyTo ### RegNet + +#### Small models +``` +torchrun --nproc_per_node=8 train.py\ + --model $MODEL --epochs 100 --batch-size 128 --wd 0.00005 --lr=0.8\ + --lr-scheduler=cosineannealinglr --lr-warmup-method=linear\ + --lr-warmup-epochs=5 --lr-warmup-decay=0.1 +``` +Here `$MODEL` is one of `regnet_x_400mf`, `regnet_x_800mf`, `regnet_x_1_6gf`, `regnet_y_400mf`, `regnet_y_800mf` and `regnet_y_1_6gf`. Please note we used learning rate 0.4 for `regent_y_400mf` to get the same Acc@1 as [the paper)(https://arxiv.org/abs/2003.13678). + +### Medium models ``` -torchrun --nproc_per_node=8 train.py --model regnet_y_400mf --epochs 100 --batch-size 128 +torchrun --nproc_per_node=8 train.py\ + --model $MODEL --epochs 100 --batch-size 64 --wd 0.00005 --lr=0.4\ + --lr-scheduler=cosineannealinglr --lr-warmup-method=linear\ + --lr-warmup-epochs=5 --lr-warmup-decay=0.1 +``` +Here `$MODEL` is one of `regnet_x_3_2gf`, `regnet_x_8gf`, `regnet_x_16gf`, `regnet_y_3_2gf` and `regnet_y_8gf`. + +### Large models +``` +torchrun --nproc_per_node=8 train.py\ + --model $MODEL --epochs 100 --batch-size 32 --wd 0.00005 --lr=0.2\ + --lr-scheduler=cosineannealinglr --lr-warmup-method=linear\ + --lr-warmup-epochs=5 --lr-warmup-decay=0.1 ``` +Here `$MODEL` is one of `regnet_x_32gf`, `regnet_y_16gf` and `regnet_y_32gf`. ## Mixed precision training Automatic Mixed Precision (AMP) training on GPU for Pytorch can be enabled with the [NVIDIA Apex extension](https://github.com/NVIDIA/apex). diff --git a/torchvision/models/regnet.py b/torchvision/models/regnet.py index f18055f80b4..bbab59c4074 100644 --- a/torchvision/models/regnet.py +++ b/torchvision/models/regnet.py @@ -23,8 +23,12 @@ model_urls = { - # TODO(kazhang): add pretrained weights - "regnet_y_400m": "", + "regnet_y_400mf": "https://download.pytorch.org/models/regnet_y_400mf-540e987b.pth", + "regnet_y_800mf": "https://download.pytorch.org/models/regnet_y_800mf-1b27b58c.pth", + "regnet_y_8gf": "https://download.pytorch.org/models/regnet_y_8gf-49ff86b5.pth", + "regnet_x_400mf": "https://download.pytorch.org/models/regnet_x_400mf-adf1edd5.pth", + "regnet_x_800mf": "https://download.pytorch.org/models/regnet_x_800mf-5cb79b7e.pth", + "regnet_x_8gf": "https://download.pytorch.org/models/regnet_x_8gf-4c4e575e.pth", }
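
The `BlockParams.from_init_params` helper added in PATCH 35 is the only non-obvious piece of arithmetic in the series: it expands the four scalar RegNet parameters (`depth`, `w_0`, `w_a`, `w_m`) into per-stage widths and depths by snapping a linearly growing width sequence onto a log-spaced grid. A minimal stand-alone sketch of that expansion (reviewer aid only, written in plain Python instead of the torch ops used in the patch, and omitting the group-width compatibility adjustment):

```python
import math

def expand_params(depth, w_0, w_a, w_m, quant=8):
    # Continuous per-block widths: u_j = w_0 + w_a * j.
    widths_cont = [w_0 + w_a * j for j in range(depth)]
    # Snap each width onto the grid w_0 * w_m**k, rounded to a multiple of `quant`.
    block_widths = [
        int(round(w_0 * w_m ** round(math.log(u / w_0, w_m)) / quant)) * quant
        for u in widths_cont
    ]
    # Consecutive blocks with the same width form one stage; collect (width, depth) pairs.
    stages = []
    for w in block_widths:
        if stages and stages[-1][0] == w:
            stages[-1][1] += 1
        else:
            stages.append([w, 1])
    return stages

# The regnet_y_400mf settings used in this file (depth=16, w_0=48, w_a=27.89, w_m=2.09)
# expand to four stages: [[48, 1], [104, 3], [208, 6], [440, 6]] (1 + 3 + 6 + 6 = 16 blocks).
print(expand_params(depth=16, w_0=48, w_a=27.89, w_m=2.09))
```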
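
With the checkpoint URLs now registered in `model_urls`, the final state of the series can be smoke-tested end to end. A minimal sketch, assuming this branch is installed as `torchvision`, that the builders are re-exported from `torchvision.models` (as the docs snippet above already uses), and that the listed URLs are reachable:

```python
import torch
from torchvision import models

# regnet_y_400mf is one of the six variants that received pretrained weights in
# PATCH 40; pretrained=True downloads the URL registered in model_urls.
model = models.regnet_y_400mf(pretrained=True)
model.eval()

# The reported Acc@1 / Acc@5 numbers assume standard 224x224 ImageNet crops.
x = torch.randn(1, 3, 224, 224)
with torch.no_grad():
    logits = model(x)

print(logits.shape)  # torch.Size([1, 1000]) with the default num_classes=1000
```

The same check applies to the other five variants with registered checkpoints (`regnet_y_800mf`, `regnet_y_8gf`, `regnet_x_400mf`, `regnet_x_800mf`, `regnet_x_8gf`); the remaining builders can only be constructed with `pretrained=False` for now, since `_regnet` raises a `ValueError` when no URL is registered for the requested architecture.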