From 3ac06adf29f7bfc0a5d06d91246375f1b052a036 Mon Sep 17 00:00:00 2001
From: Vasilis Vryniotis
Date: Fri, 11 Nov 2022 13:11:38 +0000
Subject: [PATCH 01/15] Clean up `_get_inverse_affine_matrix` and
 `_compute_affine_output_size`

---
 .../transforms/functional/_geometry.py       | 109 ++++++++++++++++--
 1 file changed, 102 insertions(+), 7 deletions(-)

diff --git a/torchvision/prototype/transforms/functional/_geometry.py b/torchvision/prototype/transforms/functional/_geometry.py
index ce97ce0575d..cb08a02aad0 100644
--- a/torchvision/prototype/transforms/functional/_geometry.py
+++ b/torchvision/prototype/transforms/functional/_geometry.py
@@ -1,3 +1,4 @@
+import math
 import numbers
 import warnings
 from typing import List, Optional, Sequence, Tuple, Union
@@ -10,7 +11,6 @@
 from torchvision.transforms import functional_pil as _FP, functional_tensor as _FT
 from torchvision.transforms.functional import (
     _compute_resized_output_size as __compute_resized_output_size,
-    _get_inverse_affine_matrix,
     _get_perspective_coeffs,
     InterpolationMode,
     pil_modes_mapping,
@@ -272,6 +272,102 @@ def _affine_parse_args(
     return angle, translate, shear, center
 
 
+def _get_inverse_affine_matrix(
+    center: List[float], angle: float, translate: List[float], scale: float, shear: List[float], inverted: bool = True
+) -> List[float]:
+    # Helper method to compute the inverse matrix for an affine transformation
+
+    # Pillow requires the inverse affine transformation matrix:
+    # The affine matrix is: M = T * C * RotateScaleShear * C^-1
+    #
+    # where T is the translation matrix: [1, 0, tx | 0, 1, ty | 0, 0, 1]
+    #       C is the translation matrix to keep the center: [1, 0, cx | 0, 1, cy | 0, 0, 1]
+    #       RotateScaleShear is the rotation matrix with scale and shear
+    #
+    # RotateScaleShear(a, s, (sx, sy)) =
+    #   = R(a) * S(s) * SHy(sy) * SHx(sx)
+    #   = [ s*cos(a - sy)/cos(sy), s*(-cos(a - sy)*tan(sx)/cos(sy) - sin(a)), 0 ]
+    #     [ s*sin(a + sy)/cos(sy), s*(-sin(a - sy)*tan(sx)/cos(sy) + cos(a)), 0 ]
+    #     [ 0                    , 0                                        , 1 ]
+    # where R is a rotation matrix, S is a scaling matrix, and SHx and SHy are the shears:
+    # SHx(s) = [1, -tan(s)] and SHy(s) = [1      , 0]
+    #          [0, 1      ]             [-tan(s), 1]
+    #
+    # Thus, the inverse is M^-1 = C * RotateScaleShear^-1 * C^-1 * T^-1
+
+    rot = math.radians(angle)
+    sx = math.radians(shear[0])
+    sy = math.radians(shear[1])
+
+    cx, cy = center
+    tx, ty = translate
+
+    # Cached results
+    cossy = math.cos(sy)
+    tansx = math.tan(sx)
+    rot_sy = rot - sy
+    cx_plus_tx = cx + tx
+    cy_plus_ty = cy + ty
+
+    # RSS without scaling
+    a = math.cos(rot_sy) / cossy
+    b = -math.sin(rot) - a * tansx
+    c = math.sin(rot_sy) / cossy
+    d = math.cos(rot) - c * tansx
+
+    if inverted:
+        # Inverted rotation matrix with scale and shear
+        # det([[a, b], [c, d]]) == 1, since det(rotation) = 1 and det(shear) = 1
+        matrix = [d / scale, -b / scale, 0.0, -c / scale, a / scale, 0.0]
+        # Apply inverse of translation and of center translation: RSS^-1 * C^-1 * T^-1
+        # and then apply center translation: C * RSS^-1 * C^-1 * T^-1
+        matrix[2] += cx - matrix[0] * cx_plus_tx - matrix[1] * cy_plus_ty
+        matrix[5] += cy - matrix[3] * cx_plus_tx - matrix[4] * cy_plus_ty
+    else:
+        matrix = [a * scale, b * scale, 0.0, c * scale, d * scale, 0.0]
+        # Apply inverse of center translation: RSS * C^-1
+        # and then apply translation and center: T * C * RSS * C^-1
+        matrix[2] += cx_plus_tx - matrix[0] * cx - matrix[1] * cy
+        matrix[5] += cy_plus_ty - matrix[3] * cx - matrix[4] * cy
+
+    return matrix
+
+
+def _compute_affine_output_size(matrix: List[float], w: int, h: int) -> Tuple[int, int]:
+    # Inspired by the PIL implementation:
+    # https://github.com/python-pillow/Pillow/blob/11de3318867e4398057373ee9f12dcb33db7335c/src/PIL/Image.py#L2054
+
+    # pts are the Top-Left, Bottom-Left, Bottom-Right and Top-Right corner points.
+    # Points are shifted due to the affine matrix torch convention about
+    # the center point. Center is (0, 0) for the image center pivot point (w * 0.5, h * 0.5)
+    half_w = 0.5 * w
+    half_h = 0.5 * h
+    pts = torch.tensor(
+        [
+            [-half_w, -half_h, 1.0],
+            [-half_w, half_h, 1.0],
+            [half_w, half_h, 1.0],
+            [half_w, -half_h, 1.0],
+        ]
+    )
+    theta = torch.tensor(matrix, dtype=torch.float).view(2, 3)
+    new_pts = torch.matmul(pts, theta.T)
+    min_vals, max_vals = new_pts.aminmax(dim=0)
+
+    # shift points to [0, w] and [0, h] interval to match PIL results
+    halfs = torch.tensor((half_w, half_h))
+    min_vals.add_(halfs)
+    max_vals.add_(halfs)
+
+    # Truncate precision to 1e-4 to avoid values like Xe-15 being ceiled up to 1.0
+    tol = 1e-4
+    inv_tol = 1.0 / tol
+    cmax = max_vals.mul_(inv_tol).trunc_().mul_(tol).ceil_()
+    cmin = min_vals.mul_(inv_tol).trunc_().mul_(tol).floor_()
+    size = cmax.sub_(cmin)
+    return int(size[0]), int(size[1])  # w, h
+
+
 def affine_image_tensor(
     image: torch.Tensor,
     angle: Union[int, float],
@@ -395,7 +491,7 @@ def _affine_bounding_box_xyxy(
         out_bboxes.sub_(tr.repeat((1, 2)))
         # Estimate meta-data for image with inverted=True and with center=[0,0]
         affine_vector = _get_inverse_affine_matrix([0.0, 0.0], angle, translate, scale, shear)
-        new_width, new_height = _FT._compute_affine_output_size(affine_vector, width, height)
+        new_width, new_height = _compute_affine_output_size(affine_vector, width, height)
         spatial_size = (new_height, new_width)
 
         return out_bboxes.to(bounding_box.dtype), spatial_size
@@ -552,7 +648,7 @@ def rotate_image_tensor(
         )
         new_height, new_width = image.shape[-2:]
     else:
-        new_width, new_height = _FT._compute_affine_output_size(matrix, width, height) if expand else (width, height)
+        new_width, new_height = _compute_affine_output_size(matrix, width, height) if expand else (width, height)
 
     return image.reshape(shape[:-3] + (num_channels, new_height, new_width))
 
@@ -917,7 +1013,6 @@ def _perspective_grid(coeffs: List[float], ow: int, oh: int, dtype: torch.dtype,
     # x_out = (coeffs[0] * x + coeffs[1] * y + coeffs[2]) / (coeffs[6] * x + coeffs[7] * y + 1)
     # y_out = (coeffs[3] * x + coeffs[4] * y + coeffs[5]) / (coeffs[6] * x + coeffs[7] * y + 1)
     #
-    # TODO: should we define them transposed?
     theta1 = torch.tensor(
         [[[coeffs[0], coeffs[1], coeffs[2]], [coeffs[3], coeffs[4], coeffs[5]]]], dtype=dtype, device=device
     )
@@ -932,8 +1027,9 @@ def _perspective_grid(coeffs: List[float], ow: int, oh: int, dtype: torch.dtype,
     base_grid[..., 2].fill_(1)
 
     rescaled_theta1 = theta1.transpose(1, 2).div_(torch.tensor([0.5 * ow, 0.5 * oh], dtype=dtype, device=device))
-    output_grid1 = base_grid.view(1, oh * ow, 3).bmm(rescaled_theta1)
-    output_grid2 = base_grid.view(1, oh * ow, 3).bmm(theta2.transpose(1, 2))
+    shape = (1, oh * ow, 3)
+    output_grid1 = base_grid.view(shape).bmm(rescaled_theta1)
+    output_grid2 = base_grid.view(shape).bmm(theta2.transpose(1, 2))
 
     output_grid = output_grid1.div_(output_grid2).sub_(1.0)
     return output_grid.view(1, oh, ow, 2)
@@ -1059,7 +1155,6 @@ def perspective_bounding_box(
         (-perspective_coeffs[0] * perspective_coeffs[7] + perspective_coeffs[1] * perspective_coeffs[6]) / denom,
     ]
 
-    # TODO: should we define them transposed?
     theta1 = torch.tensor(
         [[inv_coeffs[0], inv_coeffs[1], inv_coeffs[2]], [inv_coeffs[3], inv_coeffs[4], inv_coeffs[5]]],
         dtype=dtype,

From 62deb434a83194f0cff72b0684d50c8d500ad02c Mon Sep 17 00:00:00 2001
From: Vasilis Vryniotis
Date: Fri, 11 Nov 2022 17:15:51 +0000
Subject: [PATCH 02/15] Optimize `_apply_grid_transform`

---
 .../transforms/functional/_geometry.py       | 61 +++++++++++++++++--
 1 file changed, 55 insertions(+), 6 deletions(-)

diff --git a/torchvision/prototype/transforms/functional/_geometry.py b/torchvision/prototype/transforms/functional/_geometry.py
index cb08a02aad0..57dcdc56825 100644
--- a/torchvision/prototype/transforms/functional/_geometry.py
+++ b/torchvision/prototype/transforms/functional/_geometry.py
@@ -5,7 +5,7 @@
 import PIL.Image
 import torch
-from torch.nn.functional import interpolate, pad as torch_pad
+from torch.nn.functional import interpolate, grid_sample, pad as torch_pad
 
 from torchvision.prototype import features
 from torchvision.transforms import functional_pil as _FP, functional_tensor as _FT
@@ -368,6 +368,39 @@ def _compute_affine_output_size(matrix: List[float], w: int, h: int) -> Tuple[in
     return int(size[0]), int(size[1])  # w, h
 
 
+def _apply_grid_transform(
+    float_img: torch.Tensor, grid: torch.Tensor, mode: str, fill: Optional[Union[int, float, List[float]]]
+) -> torch.Tensor:
+
+    shape = float_img.shape
+    if shape[0] > 1:
+        # Apply same grid to a batch of images
+        grid = grid.expand(shape[0], -1, -1, -1)
+
+    # Append a dummy mask for customized fill colors; this should be faster than calling grid_sample() twice
+    if fill is not None:
+        mask = torch.ones((shape[0], 1, shape[2], shape[3]), dtype=float_img.dtype, device=float_img.device)
+        float_img = torch.cat((float_img, mask), dim=1)
+
+    float_img = grid_sample(float_img, grid, mode=mode, padding_mode="zeros", align_corners=False)
+
+    # Fill with required color
+    if fill is not None:
+        float_img, mask = torch.tensor_split(float_img, indices=(-1,), dim=-3)
+        mask = mask.expand_as(float_img)
+        fill_list = fill if isinstance(fill, (tuple, list)) else [float(fill)]
+        fill_img = torch.tensor(fill_list, dtype=float_img.dtype, device=float_img.device).view(1, -1, 1, 1)
+        if mode == "nearest":
+            bool_mask = mask < 0.5
+            float_img[bool_mask] = fill_img.expand_as(float_img)[bool_mask]
+        else:  # 'bilinear'
+            # The following is mathematically equivalent to:
+            # img * mask + (1.0 - mask) * fill = img * mask - fill * mask + fill = mask * (img - fill) + fill
+            float_img = float_img.sub_(fill_img).mul_(mask).add_(fill_img)
+
+    return float_img
+
+
 def affine_image_tensor(
     image: torch.Tensor,
     angle: Union[int, float],
@@ -1065,10 +1098,15 @@ def perspective_image_tensor(
         return image
 
     shape = image.shape
+    ndim = image.ndim
+    fp = torch.is_floating_point(image)
 
-    if image.ndim > 4:
+    if ndim > 4:
         image = image.reshape((-1,) + shape[-3:])
         needs_unsquash = True
+    elif ndim == 3:
+        image = image.unsqueeze(0)
+        needs_unsquash = True
     else:
         needs_unsquash = False
 
@@ -1082,9 +1120,12 @@ def perspective_image_tensor(
     )
 
     ow, oh = image.shape[-1], image.shape[-2]
-    dtype = image.dtype if torch.is_floating_point(image) else torch.float32
+    dtype = image.dtype if fp else torch.float32
     grid = _perspective_grid(perspective_coeffs, ow=ow, oh=oh, dtype=dtype, device=image.device)
-    output = _FT._apply_grid_transform(image, grid, interpolation.value, fill=fill)
+    output = _apply_grid_transform(image if fp else image.to(dtype), grid, interpolation.value, fill=fill)
+
+    if not fp:
+        output = output.to(image.dtype)
 
     if needs_unsquash:
         output = output.reshape(shape)
@@ -1261,17 +1302,25 @@ def elastic_image_tensor(
         return image
 
     shape = image.shape
+    ndim = image.ndim
     device = image.device
+    fp = torch.is_floating_point(image)
 
-    if image.ndim > 4:
+    if ndim > 4:
         image = image.reshape((-1,) + shape[-3:])
         needs_unsquash = True
+    elif ndim == 3:
+        image = image.unsqueeze(0)
+        needs_unsquash = True
     else:
         needs_unsquash = False
 
     image_height, image_width = shape[-2:]
     grid = _create_identity_grid((image_height, image_width), device=device).add_(displacement.to(device))
-    output = _FT._apply_grid_transform(image, grid, interpolation.value, fill)
+    output = _apply_grid_transform(image if fp else image.to(torch.float32), grid, interpolation.value, fill=fill)
+
+    if not fp:
+        output = output.to(image.dtype)
 
     if needs_unsquash:
         output = output.reshape(shape)

From 2f0d7638f6a9c5114f3ff632a83a19823c862488 Mon Sep 17 00:00:00 2001
From: Vasilis Vryniotis
Date: Fri, 11 Nov 2022 17:27:12 +0000
Subject: [PATCH 03/15] Cleanup `_assert_grid_transform_inputs`

---
 .../transforms/functional/_geometry.py       | 35 ++++++++++++++++++-
 1 file changed, 34 insertions(+), 1 deletion(-)

diff --git a/torchvision/prototype/transforms/functional/_geometry.py b/torchvision/prototype/transforms/functional/_geometry.py
index 57dcdc56825..7de6d8dccb1 100644
--- a/torchvision/prototype/transforms/functional/_geometry.py
+++ b/torchvision/prototype/transforms/functional/_geometry.py
@@ -401,6 +401,39 @@ def _apply_grid_transform(
     return float_img
 
 
+def _assert_grid_transform_inputs(
+    img: torch.Tensor,
+    matrix: Optional[List[float]],
+    interpolation: str,
+    fill: Optional[Union[int, float, List[float]]],
+    supported_interpolation_modes: List[str],
+    coeffs: Optional[List[float]] = None,
+) -> None:
+    if matrix is not None:
+        if not isinstance(matrix, list):
+            raise TypeError("Argument matrix should be a list")
+        elif len(matrix) != 6:
+            raise ValueError("Argument matrix should have 6 float values")
+
+    if coeffs is not None and len(coeffs) != 8:
+        raise ValueError("Argument coeffs should have 8 float values")
+
+    if fill is not None:
+        if isinstance(fill, (tuple, list)):
+            length = len(fill)
+            num_channels = img.shape[-3]
+            if length > 1 and length != num_channels:
+                raise ValueError(
+                    "The number of elements in 'fill' cannot broadcast to match the number of "
+                    f"channels of the image ({length} != {num_channels})"
+                )
+        elif not isinstance(fill, (int, float)):
+            raise ValueError("Argument fill should be either int, float, tuple or list")
+
+    if interpolation not in supported_interpolation_modes:
+        raise ValueError(f"Interpolation mode '{interpolation}' is unsupported with Tensor input")
+
+
 def affine_image_tensor(
     image: torch.Tensor,
     angle: Union[int, float],
@@ -1143,7 +1176,7 @@ def perspective_image_tensor(
     else:
         needs_unsquash = False
 
-    _FT._assert_grid_transform_inputs(
+    _assert_grid_transform_inputs(
         image,
         matrix=None,
         interpolation=interpolation.value,

From dca1923ac0cd96f045f2764e6697510a1d5c5ea9 Mon Sep 17 00:00:00 2001
From: Vasilis Vryniotis
Date: Fri, 11 Nov 2022 18:20:22 +0000
Subject: [PATCH 04/15] Fix bugs on `_pad_with_scalar_fill` & `crop_mask` and
 port `crop_image_tensor`

---
 .../transforms/functional/_geometry.py       | 96 ++++++++++++-------
 1 file changed, 61 insertions(+), 35 deletions(-)

diff --git a/torchvision/prototype/transforms/functional/_geometry.py b/torchvision/prototype/transforms/functional/_geometry.py
index 7de6d8dccb1..08ee30ddd88 100644
--- a/torchvision/prototype/transforms/functional/_geometry.py
+++ b/torchvision/prototype/transforms/functional/_geometry.py
@@ -889,39 +889,38 @@ def _pad_with_scalar_fill(
     shape = image.shape
     num_channels, height, width = shape[-3:]
 
-    if image.numel() > 0:
-        image = image.reshape(-1, num_channels, height, width)
-
-        if padding_mode == "edge":
-            # Similar to the padding order, `torch_pad`'s and PIL's padding modes don't have the same names. Thus, we map
-            # the PIL name for the padding mode, which we are also using for our API, to the corresponding `torch_pad`
-            # name.
-            padding_mode = "replicate"
-
-        if padding_mode == "constant":
-            image = torch_pad(image, torch_padding, mode=padding_mode, value=float(fill))
-        elif padding_mode in ("reflect", "replicate"):
-            # `torch_pad` only supports `"reflect"` or `"replicate"` padding for floating point inputs.
-            # TODO: See https://github.com/pytorch/pytorch/issues/40763
-            dtype = image.dtype
-            if not image.is_floating_point():
-                needs_cast = True
-                image = image.to(torch.float32)
-            else:
-                needs_cast = False
-
-            image = torch_pad(image, torch_padding, mode=padding_mode)
-
-            if needs_cast:
-                image = image.to(dtype)
-        else:  # padding_mode == "symmetric"
-            image = _FT._pad_symmetric(image, torch_padding)
+    batch_size = 1
+    for s in shape[:-3]:
+        batch_size *= s
+
+    image = image.reshape(batch_size, num_channels, height, width)
+
+    if padding_mode == "edge":
+        # Similar to the padding order, `torch_pad`'s and PIL's padding modes don't have the same names. Thus, we map
+        # the PIL name for the padding mode, which we are also using for our API, to the corresponding `torch_pad`
+        # name.
+        padding_mode = "replicate"
+
+    if padding_mode == "constant":
+        image = torch_pad(image, torch_padding, mode=padding_mode, value=float(fill))
+    elif padding_mode in ("reflect", "replicate"):
+        # `torch_pad` only supports `"reflect"` or `"replicate"` padding for floating point inputs.
+        # TODO: See https://github.com/pytorch/pytorch/issues/40763
+        dtype = image.dtype
+        if not image.is_floating_point():
+            needs_cast = True
+            image = image.to(torch.float32)
+        else:
+            needs_cast = False
 
-        new_height, new_width = image.shape[-2:]
-    else:
-        left, right, top, bottom = torch_padding
-        new_height = height + top + bottom
-        new_width = width + left + right
+        image = torch_pad(image, torch_padding, mode=padding_mode)
+
+        if needs_cast:
+            image = image.to(dtype)
+    else:  # padding_mode == "symmetric"
+        image = _FT._pad_symmetric(image, torch_padding)
+
+    new_height, new_width = image.shape[-2:]
 
     return image.reshape(shape[:-3] + (num_channels, new_height, new_width))
 
@@ -1030,7 +1029,23 @@ def pad(
     return pad_image_pil(inpt, padding, fill=fill, padding_mode=padding_mode)
 
 
-crop_image_tensor = _FT.crop
+def crop_image_tensor(image: torch.Tensor, top: int, left: int, height: int, width: int) -> torch.Tensor:
+    h, w = image.shape[-2:]
+
+    right = left + width
+    bottom = top + height
+
+    if left < 0 or top < 0 or right > w or bottom > h:
+        padding_ltrb = [
+            max(min(right, 0) - left, 0),
+            max(min(bottom, 0) - top, 0),
+            max(right - max(w, left), 0),
+            max(bottom - max(h, top), 0),
+        ]
+        return pad_image_tensor(image[..., max(top, 0) : bottom, max(left, 0) : right], padding_ltrb)
+    return image[..., top:bottom, left:right]
+
+
 crop_image_pil = _FP.crop
 
 
@@ -1055,7 +1070,18 @@ def crop_bounding_box(
 
 
 def crop_mask(mask: torch.Tensor, top: int, left: int, height: int, width: int) -> torch.Tensor:
-    return crop_image_tensor(mask, top, left, height, width)
+    if mask.ndim < 3:
+        mask = mask.unsqueeze(0)
+        needs_squeeze = True
+    else:
+        needs_squeeze = False
+
+    output = crop_image_tensor(mask, top, left, height, width)
+
+    if needs_squeeze:
+        output = output.squeeze(0)
+
+    return output
 
 
 def crop_video(video: torch.Tensor, top: int, left: int, height: int, width: int) -> torch.Tensor:
@@ -1152,7 +1178,7 @@ def perspective_image_tensor(
         coeffs=perspective_coeffs,
     )
 
-    ow, oh = image.shape[-1], image.shape[-2]
+    oh, ow = image.shape[-2:]
     dtype = image.dtype if fp else torch.float32
     grid = _perspective_grid(perspective_coeffs, ow=ow, oh=oh, dtype=dtype, device=image.device)
     output = _apply_grid_transform(image if fp else image.to(dtype), grid, interpolation.value, fill=fill)

From b5548ecd13b5c9fa2788f52e3d68fa6e5d1a9e8b Mon Sep 17 00:00:00 2001
From: Vasilis Vryniotis
Date: Fri, 11 Nov 2022 18:39:52 +0000
Subject: [PATCH 05/15] Call directly `_pad_with_scalar_fill`

---
 torchvision/prototype/transforms/functional/_geometry.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/torchvision/prototype/transforms/functional/_geometry.py b/torchvision/prototype/transforms/functional/_geometry.py
index 08ee30ddd88..89c243ce798 100644
--- a/torchvision/prototype/transforms/functional/_geometry.py
+++ b/torchvision/prototype/transforms/functional/_geometry.py
@@ -1036,13 +1036,14 @@ def crop_image_tensor(image: torch.Tensor, top: int, left: int, height: int, wid
     bottom = top + height
 
     if left < 0 or top < 0 or right > w or bottom > h:
-        padding_ltrb = [
+        image = image[..., max(top, 0) : bottom, max(left, 0) : right]
+        torch_padding = [
             max(min(right, 0) - left, 0),
-            max(min(bottom, 0) - top, 0),
             max(right - max(w, left), 0),
+            max(min(bottom, 0) - top, 0),
             max(bottom - max(h, top), 0),
         ]
-        return pad_image_tensor(image[..., max(top, 0) : bottom, max(left, 0) : right], padding_ltrb)
+        return _pad_with_scalar_fill(image, torch_padding, fill=0, padding_mode="constant")
     return image[..., top:bottom, left:right]

From 709b34a49ed607e8b84c5667ec4d9cc92a2fdacc Mon Sep 17 00:00:00 2001
From: Vasilis Vryniotis
Date: Fri, 11 Nov 2022 18:50:33 +0000
Subject: [PATCH 06/15] Fix linter

---
 torchvision/prototype/transforms/functional/_geometry.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/torchvision/prototype/transforms/functional/_geometry.py b/torchvision/prototype/transforms/functional/_geometry.py
index 89c243ce798..d92a8f6869c 100644
--- a/torchvision/prototype/transforms/functional/_geometry.py
+++ b/torchvision/prototype/transforms/functional/_geometry.py
@@ -5,7 +5,7 @@
 import PIL.Image
 import torch
-from torch.nn.functional import interpolate, grid_sample, pad as torch_pad
+from torch.nn.functional import grid_sample, interpolate, pad as torch_pad
 
 from torchvision.prototype import features
 from torchvision.transforms import functional_pil as _FP, functional_tensor as _FT
@@ -369,8 +369,8 @@ def _compute_affine_output_size(matrix: List[float], w: int, h: int) -> Tuple[in
 
 
 def _apply_grid_transform(
-    float_img: torch.Tensor, grid: torch.Tensor, mode: str, fill: Optional[Union[int, float, List[float]]]
-) -> torch.Tensor:
+    float_img: torch.Tensor, grid: torch.Tensor, mode: str, fill: Optional[Union[int, float, List[float]]]
+) -> torch.Tensor:
 
     shape = float_img.shape
     if shape[0] > 1:

From b9a6e74dd9bb68aa5059a88055f124c68c51b040 Mon Sep 17 00:00:00 2001
From: Vasilis Vryniotis
Date: Fri, 11 Nov 2022 18:56:56 +0000
Subject: [PATCH 07/15] Clean up `center_crop_image_tensor`

---
 torchvision/prototype/transforms/functional/_geometry.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torchvision/prototype/transforms/functional/_geometry.py b/torchvision/prototype/transforms/functional/_geometry.py
index d92a8f6869c..b44a926c3ae 100644
--- a/torchvision/prototype/transforms/functional/_geometry.py
+++ b/torchvision/prototype/transforms/functional/_geometry.py
@@ -1538,7 +1538,7 @@ def center_crop_image_tensor(image: torch.Tensor, output_size: List[int]) -> tor
 
     if crop_height > image_height or crop_width > image_width:
         padding_ltrb = _center_crop_compute_padding(crop_height, crop_width, image_height, image_width)
-        image = _FT.torch_pad(image, _FT._parse_pad_padding(padding_ltrb), value=0.0)
+        image = torch_pad(image, _parse_pad_padding(padding_ltrb), value=0.0)
 
         image_height, image_width = image.shape[-2:]
     if crop_width == image_width and crop_height == image_height:

From 62b9d47be6582114101617179ee43bd8a1fa4647 Mon Sep 17 00:00:00 2001
From: Vasilis Vryniotis
Date: Mon, 14 Nov 2022 10:36:46 +0000
Subject: [PATCH 08/15] Fix comments.
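
The comment above `_get_inverse_affine_matrix` wrote the (1, 0) entry as
s*sin(a + sy)/cos(sy), while the code computes sin(a - sy)/cos(sy), which is
the correct expansion. A quick numerical check of the corrected matrix against
the explicit product R(a) * S(s) * SHy(sy) * SHx(sx) (illustrative only, not
part of the change; the sample angles and scale below are arbitrary):

    import math

    import torch

    rot, sx, sy = (math.radians(v) for v in (33.0, 10.0, 20.0))
    s = 1.25

    # The four factors named in the comment
    R = torch.tensor([[math.cos(rot), -math.sin(rot)], [math.sin(rot), math.cos(rot)]])
    S = s * torch.eye(2)
    SHx = torch.tensor([[1.0, -math.tan(sx)], [0.0, 1.0]])
    SHy = torch.tensor([[1.0, 0.0], [-math.tan(sy), 1.0]])

    # Closed form from the (corrected) comment
    expected = s * torch.tensor(
        [
            [math.cos(rot - sy) / math.cos(sy), -math.cos(rot - sy) * math.tan(sx) / math.cos(sy) - math.sin(rot)],
            [math.sin(rot - sy) / math.cos(sy), -math.sin(rot - sy) * math.tan(sx) / math.cos(sy) + math.cos(rot)],
        ]
    )
    torch.testing.assert_close(R @ S @ SHy @ SHx, expected)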
---
 torchvision/prototype/transforms/functional/_geometry.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torchvision/prototype/transforms/functional/_geometry.py b/torchvision/prototype/transforms/functional/_geometry.py
index b44a926c3ae..b02ecb8b16b 100644
--- a/torchvision/prototype/transforms/functional/_geometry.py
+++ b/torchvision/prototype/transforms/functional/_geometry.py
@@ -287,7 +287,7 @@ def _get_inverse_affine_matrix(
     # RotateScaleShear(a, s, (sx, sy)) =
     #   = R(a) * S(s) * SHy(sy) * SHx(sx)
     #   = [ s*cos(a - sy)/cos(sy), s*(-cos(a - sy)*tan(sx)/cos(sy) - sin(a)), 0 ]
-    #     [ s*sin(a + sy)/cos(sy), s*(-sin(a - sy)*tan(sx)/cos(sy) + cos(a)), 0 ]
+    #     [ s*sin(a - sy)/cos(sy), s*(-sin(a - sy)*tan(sx)/cos(sy) + cos(a)), 0 ]
     #     [ 0                    , 0                                        , 1 ]
     # where R is a rotation matrix, S is a scaling matrix, and SHx and SHy are the shears:
     # SHx(s) = [1, -tan(s)] and SHy(s) = [1      , 0]

From b3a0bb180f9febc5a7db85c2b4265791c2515060 Mon Sep 17 00:00:00 2001
From: Vasilis Vryniotis
Date: Mon, 14 Nov 2022 12:19:41 +0000
Subject: [PATCH 09/15] Fixing rounding issues.

---
 torchvision/prototype/transforms/functional/_geometry.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/torchvision/prototype/transforms/functional/_geometry.py b/torchvision/prototype/transforms/functional/_geometry.py
index b02ecb8b16b..c57d92f0b58 100644
--- a/torchvision/prototype/transforms/functional/_geometry.py
+++ b/torchvision/prototype/transforms/functional/_geometry.py
@@ -311,7 +311,7 @@ def _get_inverse_affine_matrix(
 
     # RSS without scaling
     a = math.cos(rot_sy) / cossy
-    b = -math.sin(rot) - a * tansx
+    b = -(a * tansx + math.sin(rot))
     c = math.sin(rot_sy) / cossy
     d = math.cos(rot) - c * tansx
 
@@ -1179,13 +1179,13 @@ def perspective_image_tensor(
         coeffs=perspective_coeffs,
     )
 
-    oh, ow = image.shape[-2:]
+    oh, ow = shape[-2:]
     dtype = image.dtype if fp else torch.float32
     grid = _perspective_grid(perspective_coeffs, ow=ow, oh=oh, dtype=dtype, device=image.device)
     output = _apply_grid_transform(image if fp else image.to(dtype), grid, interpolation.value, fill=fill)
 
     if not fp:
-        output = output.to(image.dtype)
+        output = output.round_().to(image.dtype)
 
     if needs_unsquash:
         output = output.reshape(shape)
@@ -1380,7 +1380,7 @@ def elastic_image_tensor(
     output = _apply_grid_transform(image if fp else image.to(torch.float32), grid, interpolation.value, fill=fill)
 
     if not fp:
-        output = output.to(image.dtype)
+        output = output.round_().to(image.dtype)
 
     if needs_unsquash:
         output = output.reshape(shape)

From 8e110f6912cb5468dfae546f47cb9940a61b3038 Mon Sep 17 00:00:00 2001
From: Vasilis Vryniotis
Date: Mon, 14 Nov 2022 12:49:39 +0000
Subject: [PATCH 10/15] Bumping tolerance for rotate which is unrelated to this
 PR.
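
The rotate kernel's PIL-reference check aggregates absolute pixel differences
with `agg_method="mean"`, and the observed mean now sits slightly above the
old bound of 100; the failure predates this PR. uint8 comparisons of this kind
are brittle around rounding boundaries: a sketch of how little it takes to
move a mean-aggregated metric (illustrative only; the random tensor just
stands in for a kernel output):

    import torch

    torch.manual_seed(0)
    float_out = torch.rand(3, 64, 64) * 255.0  # stand-in for a bilinear rotate result

    truncated = float_out.to(torch.uint8)        # a bare cast truncates
    rounded = float_out.round().to(torch.uint8)  # rounding first shifts ~half the pixels by one level

    diff = (rounded.int() - truncated.int()).abs()
    print(diff.max().item(), diff.float().mean().item())  # prints 1 and roughly 0.5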
---
 test/prototype_transforms_kernel_infos.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/prototype_transforms_kernel_infos.py b/test/prototype_transforms_kernel_infos.py
index 361a921b18e..25daf3da59f 100644
--- a/test/prototype_transforms_kernel_infos.py
+++ b/test/prototype_transforms_kernel_infos.py
@@ -915,7 +915,7 @@ def sample_inputs_rotate_video():
         reference_inputs_fn=reference_inputs_rotate_image_tensor,
         float32_vs_uint8=True,
         # TODO: investigate
-        closeness_kwargs=pil_reference_pixel_difference(100, agg_method="mean"),
+        closeness_kwargs=pil_reference_pixel_difference(110, agg_method="mean"),
         test_marks=[
             xfail_jit_tuple_instead_of_list("fill"),
             # TODO: check if this is a regression since it seems that should be supported if `int` is ok

From 555df2d97b5ebb0142e8503698874ab261274862 Mon Sep 17 00:00:00 2001
From: Vasilis Vryniotis
Date: Mon, 14 Nov 2022 12:54:59 +0000
Subject: [PATCH 11/15] Fix tolerance threshold for RandomPerspective.

---
 test/test_prototype_transforms_consistency.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/test_prototype_transforms_consistency.py b/test/test_prototype_transforms_consistency.py
index 0cc52f8b838..61ebda0f4bb 100644
--- a/test/test_prototype_transforms_consistency.py
+++ b/test/test_prototype_transforms_consistency.py
@@ -401,6 +401,7 @@ def __init__(
             ArgsKwargs(p=1, distortion_scale=0.1, fill=1),
             ArgsKwargs(p=1, distortion_scale=0.4, fill=(1, 2, 3)),
         ],
+        closeness_kwargs={"atol": 1e-6, "rtol": 1e-6},
     ),
     ConsistencyConfig(
         prototype_transforms.RandomRotation,

From a32be72366ea072a34fff801afb9778951979b84 Mon Sep 17 00:00:00 2001
From: Vasilis Vryniotis
Date: Mon, 14 Nov 2022 13:57:44 +0000
Subject: [PATCH 12/15] Clean up `_affine_grid` and `affine_image_tensor`

---
 .../transforms/functional/_geometry.py       | 61 +++++++++++++++++--
 1 file changed, 55 insertions(+), 6 deletions(-)

diff --git a/torchvision/prototype/transforms/functional/_geometry.py b/torchvision/prototype/transforms/functional/_geometry.py
index c57d92f0b58..df8dcb0bde0 100644
--- a/torchvision/prototype/transforms/functional/_geometry.py
+++ b/torchvision/prototype/transforms/functional/_geometry.py
@@ -369,7 +369,7 @@ def _compute_affine_output_size(matrix: List[float], w: int, h: int) -> Tuple[in
 
 
 def _apply_grid_transform(
-    float_img: torch.Tensor, grid: torch.Tensor, mode: str, fill: Optional[Union[int, float, List[float]]]
+    float_img: torch.Tensor, grid: torch.Tensor, mode: str, fill: features.FillTypeJIT
 ) -> torch.Tensor:
 
     shape = float_img.shape
@@ -405,7 +405,7 @@ def _assert_grid_transform_inputs(
     img: torch.Tensor,
     matrix: Optional[List[float]],
     interpolation: str,
-    fill: Optional[Union[int, float, List[float]]],
+    fill: features.FillTypeJIT,
     supported_interpolation_modes: List[str],
     coeffs: Optional[List[float]] = None,
 ) -> None:
@@ -434,6 +434,34 @@ def _assert_grid_transform_inputs(
         raise ValueError(f"Interpolation mode '{interpolation}' is unsupported with Tensor input")
 
 
+def _affine_grid(
+    theta: torch.Tensor,
+    w: int,
+    h: int,
+    ow: int,
+    oh: int,
+) -> torch.Tensor:
+    # https://github.com/pytorch/pytorch/blob/74b65c32be68b15dc7c9e8bb62459efbfbde33d8/aten/src/ATen/native/
+    # AffineGridGenerator.cpp#L18
+    # Difference with AffineGridGenerator is that:
+    # 1) we normalize grid values after applying theta
+    # 2) we can normalize by another image size, so that it covers the "extend" option like in PIL.Image.rotate
+    dtype = theta.dtype
+    device = theta.device
+
+    d = 0.5
+    base_grid = torch.empty(1, oh, ow, 3, dtype=dtype, device=device)
+    x_grid = torch.linspace(-ow * 0.5 + d, ow * 0.5 + d - 1, steps=ow, device=device)
+    base_grid[..., 0].copy_(x_grid)
+    y_grid = torch.linspace(-oh * 0.5 + d, oh * 0.5 + d - 1, steps=oh, device=device).unsqueeze_(-1)
+    base_grid[..., 1].copy_(y_grid)
+    base_grid[..., 2].fill_(1)
+
+    rescaled_theta = theta.transpose(1, 2).div_(torch.tensor([0.5 * w, 0.5 * h], dtype=dtype, device=device))
+    output_grid = base_grid.view(1, oh * ow, 3).bmm(rescaled_theta)
+    return output_grid.view(1, oh, ow, 2)
+
+
 def affine_image_tensor(
     image: torch.Tensor,
     angle: Union[int, float],
@@ -448,9 +476,19 @@ def affine_image_tensor(
         return image
 
     shape = image.shape
-    num_channels, height, width = shape[-3:]
-    image = image.reshape(-1, num_channels, height, width)
+    ndim = image.ndim
+    fp = torch.is_floating_point(image)
+
+    if ndim > 4:
+        image = image.reshape((-1,) + shape[-3:])
+        needs_unsquash = True
+    elif ndim == 3:
+        image = image.unsqueeze(0)
+        needs_unsquash = True
+    else:
+        needs_unsquash = False
 
+    height, width = shape[-2:]
     angle, translate, shear, center = _affine_parse_args(angle, translate, scale, shear, interpolation, center)
 
     center_f = [0.0, 0.0]
@@ -461,9 +499,20 @@ def affine_image_tensor(
         translate_f = [float(t) for t in translate]
 
     matrix = _get_inverse_affine_matrix(center_f, angle, translate_f, scale, shear)
-    output = _FT.affine(image, matrix, interpolation=interpolation.value, fill=fill)
-    return output.reshape(shape)
+    _assert_grid_transform_inputs(image, matrix, interpolation.value, fill, ["nearest", "bilinear"])
+
+    dtype = image.dtype if fp else torch.float32
+    theta = torch.tensor(matrix, dtype=dtype, device=image.device).reshape(1, 2, 3)
+    grid = _affine_grid(theta, w=width, h=height, ow=width, oh=height)
+    output = _apply_grid_transform(image if fp else image.to(dtype), grid, interpolation.value, fill=fill)
+
+    if not fp:
+        output = output.round_().to(image.dtype)
+
+    if needs_unsquash:
+        output = output.reshape(shape)
 
+    return output
 
 @torch.jit.unused
 def affine_image_pil(

From 311ff85ae693cdca795759d9b4325777a852d947 Mon Sep 17 00:00:00 2001
From: Vasilis Vryniotis
Date: Mon, 14 Nov 2022 14:25:34 +0000
Subject: [PATCH 13/15] Clean up `rotate_image_tensor`

---
 .../transforms/functional/_geometry.py       | 30 ++++++++++++-------
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/torchvision/prototype/transforms/functional/_geometry.py b/torchvision/prototype/transforms/functional/_geometry.py
index df8dcb0bde0..cf85a1e9d7c 100644
--- a/torchvision/prototype/transforms/functional/_geometry.py
+++ b/torchvision/prototype/transforms/functional/_geometry.py
@@ -402,7 +402,7 @@ def _apply_grid_transform(
 
 
 def _assert_grid_transform_inputs(
-    img: torch.Tensor,
+    image: torch.Tensor,
     matrix: Optional[List[float]],
     interpolation: str,
     fill: features.FillTypeJIT,
@@ -421,7 +421,7 @@ def _assert_grid_transform_inputs(
     if fill is not None:
         if isinstance(fill, (tuple, list)):
             length = len(fill)
-            num_channels = img.shape[-3]
+            num_channels = image.shape[-3]
             if length > 1 and length != num_channels:
                 raise ValueError(
                     "The number of elements in 'fill' cannot broadcast to match the number of "
@@ -754,18 +754,26 @@ def rotate_image_tensor(
         matrix = _get_inverse_affine_matrix(center_f, -angle, [0.0, 0.0], 1.0, [0.0, 0.0])
 
     if image.numel() > 0:
-        image = _FT.rotate(
-            image.reshape(-1, num_channels, height, width),
-            matrix,
-            interpolation=interpolation.value,
-            expand=expand,
-            fill=fill,
-        )
-        new_height, new_width = image.shape[-2:]
+        fp = torch.is_floating_point(image)
+        image = image.reshape(-1, num_channels, height, width)
+
+        _assert_grid_transform_inputs(image, matrix, interpolation.value, fill, ["nearest", "bilinear"])
+
+        ow, oh = _compute_affine_output_size(matrix, width, height) if expand else (width, height)
+        dtype = image.dtype if fp else torch.float32
+        theta = torch.tensor(matrix, dtype=dtype, device=image.device).reshape(1, 2, 3)
+        grid = _affine_grid(theta, w=width, h=height, ow=ow, oh=oh)
+        output = _apply_grid_transform(image if fp else image.to(dtype), grid, interpolation.value, fill=fill)
+
+        if not fp:
+            output = output.round_().to(image.dtype)
+
+        new_height, new_width = output.shape[-2:]
     else:
+        output = image
         new_width, new_height = _compute_affine_output_size(matrix, width, height) if expand else (width, height)
 
-    return image.reshape(shape[:-3] + (num_channels, new_height, new_width))
+    return output.reshape(shape[:-3] + (num_channels, new_height, new_width))
 
 
 @torch.jit.unused

From 6644006733b9825409809a8702fb1be907f3e60b Mon Sep 17 00:00:00 2001
From: Vasilis Vryniotis
Date: Mon, 14 Nov 2022 14:47:56 +0000
Subject: [PATCH 14/15] Fixing linter

---
 torchvision/prototype/transforms/functional/_geometry.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/torchvision/prototype/transforms/functional/_geometry.py b/torchvision/prototype/transforms/functional/_geometry.py
index cf85a1e9d7c..a980a43dddc 100644
--- a/torchvision/prototype/transforms/functional/_geometry.py
+++ b/torchvision/prototype/transforms/functional/_geometry.py
@@ -514,6 +514,7 @@ def affine_image_tensor(
 
     return output
 
+
 @torch.jit.unused
 def affine_image_pil(
     image: PIL.Image.Image,

From d3639e06de883b632a4aeea5a9978ec929a25f43 Mon Sep 17 00:00:00 2001
From: Vasilis Vryniotis
Date: Mon, 14 Nov 2022 15:35:06 +0000
Subject: [PATCH 15/15] Address code-review comments.
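
Three review points: the cached variables in `_get_inverse_affine_matrix` get
clearer names (`cos_sy`, `tan_sx`, `rot_minus_sy`), the RandomPerspective
consistency test drops the hand-picked tolerances in favour of the dtype-based
defaults, and `_affine_grid` folds the `d = 0.5` offset into the `linspace`
endpoints. The last change is purely algebraic, since
-ow * 0.5 + d == (1.0 - ow) * 0.5 and ow * 0.5 + d - 1 == (ow - 1.0) * 0.5
for d = 0.5. A quick check (illustrative only, not part of the change):

    import torch

    d = 0.5
    for ow in (1, 2, 7, 224):
        old = torch.linspace(-ow * 0.5 + d, ow * 0.5 + d - 1, steps=ow)
        new = torch.linspace((1.0 - ow) * 0.5, (ow - 1.0) * 0.5, steps=ow)
        torch.testing.assert_close(old, new)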
---
 test/test_prototype_transforms_consistency.py |  2 +-
 .../transforms/functional/_geometry.py        | 21 +++++++++----------
 2 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/test/test_prototype_transforms_consistency.py b/test/test_prototype_transforms_consistency.py
index 61ebda0f4bb..d82d9ebea4f 100644
--- a/test/test_prototype_transforms_consistency.py
+++ b/test/test_prototype_transforms_consistency.py
@@ -401,7 +401,7 @@ def __init__(
             ArgsKwargs(p=1, distortion_scale=0.1, fill=1),
             ArgsKwargs(p=1, distortion_scale=0.4, fill=(1, 2, 3)),
         ],
-        closeness_kwargs={"atol": 1e-6, "rtol": 1e-6},
+        closeness_kwargs={"atol": None, "rtol": None},
     ),
     ConsistencyConfig(
         prototype_transforms.RandomRotation,
diff --git a/torchvision/prototype/transforms/functional/_geometry.py b/torchvision/prototype/transforms/functional/_geometry.py
index a980a43dddc..41262185b5d 100644
--- a/torchvision/prototype/transforms/functional/_geometry.py
+++ b/torchvision/prototype/transforms/functional/_geometry.py
@@ -303,17 +303,17 @@ def _get_inverse_affine_matrix(
     tx, ty = translate
 
     # Cached results
-    cossy = math.cos(sy)
-    tansx = math.tan(sx)
-    rot_sy = rot - sy
+    cos_sy = math.cos(sy)
+    tan_sx = math.tan(sx)
+    rot_minus_sy = rot - sy
     cx_plus_tx = cx + tx
     cy_plus_ty = cy + ty
 
-    # RSS without scaling
-    a = math.cos(rot_sy) / cossy
-    b = -(a * tansx + math.sin(rot))
-    c = math.sin(rot_sy) / cossy
-    d = math.cos(rot) - c * tansx
+    # Rotate Scale Shear (RSS) without scaling
+    a = math.cos(rot_minus_sy) / cos_sy
+    b = -(a * tan_sx + math.sin(rot))
+    c = math.sin(rot_minus_sy) / cos_sy
+    d = math.cos(rot) - c * tan_sx
 
     if inverted:
@@ -449,11 +449,10 @@ def _affine_grid(
     dtype = theta.dtype
     device = theta.device
 
-    d = 0.5
     base_grid = torch.empty(1, oh, ow, 3, dtype=dtype, device=device)
-    x_grid = torch.linspace(-ow * 0.5 + d, ow * 0.5 + d - 1, steps=ow, device=device)
+    x_grid = torch.linspace((1.0 - ow) * 0.5, (ow - 1.0) * 0.5, steps=ow, device=device)
     base_grid[..., 0].copy_(x_grid)
-    y_grid = torch.linspace(-oh * 0.5 + d, oh * 0.5 + d - 1, steps=oh, device=device).unsqueeze_(-1)
+    y_grid = torch.linspace((1.0 - oh) * 0.5, (oh - 1.0) * 0.5, steps=oh, device=device).unsqueeze_(-1)
     base_grid[..., 1].copy_(y_grid)
     base_grid[..., 2].fill_(1)
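
Appendix (editor's sketch, not part of the series): the single-pass fill
technique that `_apply_grid_transform` introduces in PATCH 02 and that the
affine, rotate, perspective and elastic kernels all reuse — append a ones
"mask" channel, warp everything with one `grid_sample` call, then blend the
fill color wherever the mask decayed. A minimal self-contained version; the
zoomed-out grid below is arbitrary, any affine or perspective grid works the
same way:

    import torch
    from torch.nn.functional import grid_sample

    img = torch.rand(1, 3, 8, 8)  # any float image batch
    ys, xs = torch.meshgrid(torch.linspace(-1.5, 1.5, 8), torch.linspace(-1.5, 1.5, 8), indexing="ij")
    grid = torch.stack((xs, ys), dim=-1).unsqueeze(0)  # samples outside [-1, 1], i.e. outside the image

    fill = torch.tensor([1.0, 0.5, 0.0]).view(1, -1, 1, 1)  # arbitrary per-channel fill color
    mask = torch.ones_like(img[:, :1])  # dummy mask channel

    # One grid_sample call warps the image and the mask together
    warped = grid_sample(torch.cat((img, mask), dim=1), grid, mode="bilinear", padding_mode="zeros", align_corners=False)
    out, mask = warped[:, :-1], warped[:, -1:]

    # mask stays 1 inside the source image and fades to 0 outside; blend as in the patch:
    # out * mask + (1 - mask) * fill == (out - fill) * mask + fill
    out = out.sub(fill).mul(mask).add(fill)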