From 3ac06adf29f7bfc0a5d06d91246375f1b052a036 Mon Sep 17 00:00:00 2001
From: Vasilis Vryniotis
Date: Fri, 11 Nov 2022 13:11:38 +0000
Subject: [PATCH 01/15] Clean up `_get_inverse_affine_matrix` and
 `_compute_affine_output_size`

---
 .../transforms/functional/_geometry.py       | 109 ++++++++++++++++--
 1 file changed, 102 insertions(+), 7 deletions(-)

diff --git a/torchvision/prototype/transforms/functional/_geometry.py b/torchvision/prototype/transforms/functional/_geometry.py
index ce97ce0575d..cb08a02aad0 100644
--- a/torchvision/prototype/transforms/functional/_geometry.py
+++ b/torchvision/prototype/transforms/functional/_geometry.py
@@ -1,3 +1,4 @@
+import math
 import numbers
 import warnings
 from typing import List, Optional, Sequence, Tuple, Union
@@ -10,7 +11,6 @@
 from torchvision.transforms import functional_pil as _FP, functional_tensor as _FT
 from torchvision.transforms.functional import (
     _compute_resized_output_size as __compute_resized_output_size,
-    _get_inverse_affine_matrix,
     _get_perspective_coeffs,
     InterpolationMode,
     pil_modes_mapping,
@@ -272,6 +272,102 @@ def _affine_parse_args(
     return angle, translate, shear, center
 
 
+def _get_inverse_affine_matrix(
+    center: List[float], angle: float, translate: List[float], scale: float, shear: List[float], inverted: bool = True
+) -> List[float]:
+    # Helper method to compute the inverse matrix for an affine transformation
+
+    # Pillow requires the inverse affine transformation matrix:
+    # The affine matrix is: M = T * C * RotateScaleShear * C^-1
+    #
+    # where T is the translation matrix: [1, 0, tx | 0, 1, ty | 0, 0, 1]
+    #       C is the translation matrix to keep the center: [1, 0, cx | 0, 1, cy | 0, 0, 1]
+    #       RotateScaleShear is the rotation matrix with scale and shear
+    #
+    # RotateScaleShear(a, s, (sx, sy)) =
+    #   = R(a) * S(s) * SHy(sy) * SHx(sx)
+    #   = [ s*cos(a - sy)/cos(sy), s*(-cos(a - sy)*tan(sx)/cos(sy) - sin(a)), 0 ]
+    #     [ s*sin(a + sy)/cos(sy), s*(-sin(a - sy)*tan(sx)/cos(sy) + cos(a)), 0 ]
+    #     [ 0                    , 0                                        , 1 ]
+    # where R is a rotation matrix, S is a scaling matrix, and SHx and SHy are the shears:
+    # SHx(s) = [1, -tan(s)] and SHy(s) = [1      , 0]
+    #          [0, 1      ]             [-tan(s), 1]
+    #
+    # Thus, the inverse is M^-1 = C * RotateScaleShear^-1 * C^-1 * T^-1
+
+    rot = math.radians(angle)
+    sx = math.radians(shear[0])
+    sy = math.radians(shear[1])
+
+    cx, cy = center
+    tx, ty = translate
+
+    # Cached results
+    cossy = math.cos(sy)
+    tansx = math.tan(sx)
+    rot_sy = rot - sy
+    cx_plus_tx = cx + tx
+    cy_plus_ty = cy + ty
+
+    # RSS without scaling
+    a = math.cos(rot_sy) / cossy
+    b = -math.sin(rot) - a * tansx
+    c = math.sin(rot_sy) / cossy
+    d = math.cos(rot) - c * tansx
+
+    if inverted:
+        # Inverted rotation matrix with scale and shear
+        # det([[a, b], [c, d]]) == 1, since det(rotation) = 1 and det(shear) = 1
+        matrix = [d / scale, -b / scale, 0.0, -c / scale, a / scale, 0.0]
+        # Apply inverse of translation and of center translation: RSS^-1 * C^-1 * T^-1
+        # and then apply center translation: C * RSS^-1 * C^-1 * T^-1
+        matrix[2] += cx - matrix[0] * cx_plus_tx - matrix[1] * cy_plus_ty
+        matrix[5] += cy - matrix[3] * cx_plus_tx - matrix[4] * cy_plus_ty
+    else:
+        matrix = [a * scale, b * scale, 0.0, c * scale, d * scale, 0.0]
+        # Apply inverse of center translation: RSS * C^-1
+        # and then apply translation and center: T * C * RSS * C^-1
+        matrix[2] += cx_plus_tx - matrix[0] * cx - matrix[1] * cy
+        matrix[5] += cy_plus_ty - matrix[3] * cx - matrix[4] * cy
+
+    return matrix
+
+
+def _compute_affine_output_size(matrix: List[float], w: int, h: int) -> Tuple[int, int]:
+    # Inspired by the PIL implementation:
+    # https://github.com/python-pillow/Pillow/blob/11de3318867e4398057373ee9f12dcb33db7335c/src/PIL/Image.py#L2054
+
+    # pts are the Top-Left, Bottom-Left, Bottom-Right and Top-Right corner points.
+    # Points are shifted due to the affine matrix torch convention about
+    # the center point. Center is (0, 0) for the image center pivot point (w * 0.5, h * 0.5)
+    half_w = 0.5 * w
+    half_h = 0.5 * h
+    pts = torch.tensor(
+        [
+            [-half_w, -half_h, 1.0],
+            [-half_w, half_h, 1.0],
+            [half_w, half_h, 1.0],
+            [half_w, -half_h, 1.0],
+        ]
+    )
+    theta = torch.tensor(matrix, dtype=torch.float).view(2, 3)
+    new_pts = torch.matmul(pts, theta.T)
+    min_vals, max_vals = new_pts.aminmax(dim=0)
+
+    # shift points to [0, w] and [0, h] interval to match PIL results
+    halfs = torch.tensor((half_w, half_h))
+    min_vals.add_(halfs)
+    max_vals.add_(halfs)
+
+    # Truncate precision to 1e-4 to avoid values like Xe-15 being ceiled up to 1.0
+    tol = 1e-4
+    inv_tol = 1.0 / tol
+    cmax = max_vals.mul_(inv_tol).trunc_().mul_(tol).ceil_()
+    cmin = min_vals.mul_(inv_tol).trunc_().mul_(tol).floor_()
+    size = cmax.sub_(cmin)
+    return int(size[0]), int(size[1])  # w, h
+
+
 def affine_image_tensor(
     image: torch.Tensor,
     angle: Union[int, float],
@@ -395,7 +491,7 @@ def _affine_bounding_box_xyxy(
         out_bboxes.sub_(tr.repeat((1, 2)))
         # Estimate meta-data for image with inverted=True and with center=[0,0]
         affine_vector = _get_inverse_affine_matrix([0.0, 0.0], angle, translate, scale, shear)
-        new_width, new_height = _FT._compute_affine_output_size(affine_vector, width, height)
+        new_width, new_height = _compute_affine_output_size(affine_vector, width, height)
         spatial_size = (new_height, new_width)
 
         return out_bboxes.to(bounding_box.dtype), spatial_size
@@ -552,7 +648,7 @@ def rotate_image_tensor(
         )
         new_height, new_width = image.shape[-2:]
     else:
-        new_width, new_height = _FT._compute_affine_output_size(matrix, width, height) if expand else (width, height)
+        new_width, new_height = _compute_affine_output_size(matrix, width, height) if expand else (width, height)
 
     return image.reshape(shape[:-3] + (num_channels, new_height, new_width))
 
@@ -917,7 +1013,6 @@ def _perspective_grid(coeffs: List[float], ow: int, oh: int, dtype: torch.dtype,
     # x_out = (coeffs[0] * x + coeffs[1] * y + coeffs[2]) / (coeffs[6] * x + coeffs[7] * y + 1)
     # y_out = (coeffs[3] * x + coeffs[4] * y + coeffs[5]) / (coeffs[6] * x + coeffs[7] * y + 1)
     #
-    # TODO: should we define them transposed?
     theta1 = torch.tensor(
         [[[coeffs[0], coeffs[1], coeffs[2]], [coeffs[3], coeffs[4], coeffs[5]]]], dtype=dtype, device=device
     )
@@ -932,8 +1027,9 @@ def _perspective_grid(coeffs: List[float], ow: int, oh: int, dtype: torch.dtype,
     base_grid[..., 2].fill_(1)
 
     rescaled_theta1 = theta1.transpose(1, 2).div_(torch.tensor([0.5 * ow, 0.5 * oh], dtype=dtype, device=device))
-    output_grid1 = base_grid.view(1, oh * ow, 3).bmm(rescaled_theta1)
-    output_grid2 = base_grid.view(1, oh * ow, 3).bmm(theta2.transpose(1, 2))
+    shape = (1, oh * ow, 3)
+    output_grid1 = base_grid.view(shape).bmm(rescaled_theta1)
+    output_grid2 = base_grid.view(shape).bmm(theta2.transpose(1, 2))
 
     output_grid = output_grid1.div_(output_grid2).sub_(1.0)
     return output_grid.view(1, oh, ow, 2)
@@ -1059,7 +1155,6 @@ def perspective_bounding_box(
         (-perspective_coeffs[0] * perspective_coeffs[7] + perspective_coeffs[1] * perspective_coeffs[6]) / denom,
     ]
 
-    # TODO: should we define them transposed?
     theta1 = torch.tensor(
         [[inv_coeffs[0], inv_coeffs[1], inv_coeffs[2]], [inv_coeffs[3], inv_coeffs[4], inv_coeffs[5]]],
         dtype=dtype,

From 62deb434a83194f0cff72b0684d50c8d500ad02c Mon Sep 17 00:00:00 2001
From: Vasilis Vryniotis
Date: Fri, 11 Nov 2022 17:15:51 +0000
Subject: [PATCH 02/15] Optimize `_apply_grid_transform`

---
 .../transforms/functional/_geometry.py       | 61 +++++++++++++++++--
 1 file changed, 55 insertions(+), 6 deletions(-)

diff --git a/torchvision/prototype/transforms/functional/_geometry.py b/torchvision/prototype/transforms/functional/_geometry.py
index cb08a02aad0..57dcdc56825 100644
--- a/torchvision/prototype/transforms/functional/_geometry.py
+++ b/torchvision/prototype/transforms/functional/_geometry.py
@@ -5,7 +5,7 @@
 import PIL.Image
 import torch
-from torch.nn.functional import interpolate, pad as torch_pad
+from torch.nn.functional import interpolate, grid_sample, pad as torch_pad
 
 from torchvision.prototype import features
 from torchvision.transforms import functional_pil as _FP, functional_tensor as _FT
@@ -368,6 +368,39 @@ def _compute_affine_output_size(matrix: List[float], w: int, h: int) -> Tuple[in
     return int(size[0]), int(size[1])  # w, h
 
 
+def _apply_grid_transform(
+    float_img: torch.Tensor, grid: torch.Tensor, mode: str, fill: Optional[Union[int, float, List[float]]]
+) -> torch.Tensor:
+
+    shape = float_img.shape
+    if shape[0] > 1:
+        # Apply same grid to a batch of images
+        grid = grid.expand(shape[0], -1, -1, -1)
+
+    # Append a dummy mask for customized fill colors; this should be faster than calling grid_sample() twice
+    if fill is not None:
+        mask = torch.ones((shape[0], 1, shape[2], shape[3]), dtype=float_img.dtype, device=float_img.device)
+        float_img = torch.cat((float_img, mask), dim=1)
+
+    float_img = grid_sample(float_img, grid, mode=mode, padding_mode="zeros", align_corners=False)
+
+    # Fill with required color
+    if fill is not None:
+        float_img, mask = torch.tensor_split(float_img, indices=(-1,), dim=-3)
+        mask = mask.expand_as(float_img)
+        fill_list = fill if isinstance(fill, (tuple, list)) else [float(fill)]
+        fill_img = torch.tensor(fill_list, dtype=float_img.dtype, device=float_img.device).view(1, -1, 1, 1)
+        if mode == "nearest":
+            bool_mask = mask < 0.5
+            float_img[bool_mask] = fill_img.expand_as(float_img)[bool_mask]
+        else:  # 'bilinear'
+            # The following is mathematically equivalent to:
+            # img * mask + (1.0 - mask) * fill = img * mask - fill * mask + fill = mask * (img - fill) + fill
+            float_img = float_img.sub_(fill_img).mul_(mask).add_(fill_img)
+
+    return float_img
+
+
 def affine_image_tensor(
     image: torch.Tensor,
     angle: Union[int, float],
@@ -1065,10 +1098,15 @@ def perspective_image_tensor(
         return image
 
     shape = image.shape
+    ndim = image.ndim
+    fp = torch.is_floating_point(image)
 
-    if image.ndim > 4:
+    if ndim > 4:
         image = image.reshape((-1,) + shape[-3:])
         needs_unsquash = True
+    elif ndim == 3:
+        image = image.unsqueeze(0)
+        needs_unsquash = True
     else:
         needs_unsquash = False
 
@@ -1082,9 +1120,12 @@ def perspective_image_tensor(
     )
 
     ow, oh = image.shape[-1], image.shape[-2]
-    dtype = image.dtype if torch.is_floating_point(image) else torch.float32
+    dtype = image.dtype if fp else torch.float32
     grid = _perspective_grid(perspective_coeffs, ow=ow, oh=oh, dtype=dtype, device=image.device)
-    output = _FT._apply_grid_transform(image, grid, interpolation.value, fill=fill)
+    output = _apply_grid_transform(image if fp else image.to(dtype), grid, interpolation.value, fill=fill)
+
+    if not fp:
+        output = output.to(image.dtype)
 
     if needs_unsquash:
         output = output.reshape(shape)
@@ -1261,17 +1302,25 @@ def elastic_image_tensor(
         return image
 
     shape = image.shape
+    ndim = image.ndim
     device = image.device
+    fp = torch.is_floating_point(image)
 
-    if image.ndim > 4:
+    if ndim > 4:
         image = image.reshape((-1,) + shape[-3:])
         needs_unsquash = True
+    elif ndim == 3:
+        image = image.unsqueeze(0)
+        needs_unsquash = True
     else:
         needs_unsquash = False
 
     image_height, image_width = shape[-2:]
     grid = _create_identity_grid((image_height, image_width), device=device).add_(displacement.to(device))
-    output = _FT._apply_grid_transform(image, grid, interpolation.value, fill)
+    output = _apply_grid_transform(image if fp else image.to(torch.float32), grid, interpolation.value, fill=fill)
+
+    if not fp:
+        output = output.to(image.dtype)
 
     if needs_unsquash:
         output = output.reshape(shape)

From 2f0d7638f6a9c5114f3ff632a83a19823c862488 Mon Sep 17 00:00:00 2001
From: Vasilis Vryniotis
Date: Fri, 11 Nov 2022 17:27:12 +0000
Subject: [PATCH 03/15] Cleanup `_assert_grid_transform_inputs`

---
 .../transforms/functional/_geometry.py       | 35 ++++++++++++++++++-
 1 file changed, 34 insertions(+), 1 deletion(-)

diff --git a/torchvision/prototype/transforms/functional/_geometry.py b/torchvision/prototype/transforms/functional/_geometry.py
index 57dcdc56825..7de6d8dccb1 100644
--- a/torchvision/prototype/transforms/functional/_geometry.py
+++ b/torchvision/prototype/transforms/functional/_geometry.py
@@ -401,6 +401,39 @@ def _apply_grid_transform(
     return float_img
 
 
+def _assert_grid_transform_inputs(
+    img: torch.Tensor,
+    matrix: Optional[List[float]],
+    interpolation: str,
+    fill: Optional[Union[int, float, List[float]]],
+    supported_interpolation_modes: List[str],
+    coeffs: Optional[List[float]] = None,
+) -> None:
+    if matrix is not None:
+        if not isinstance(matrix, list):
+            raise TypeError("Argument matrix should be a list")
+        elif len(matrix) != 6:
+            raise ValueError("Argument matrix should have 6 float values")
+
+    if coeffs is not None and len(coeffs) != 8:
+        raise ValueError("Argument coeffs should have 8 float values")
+
+    if fill is not None:
+        if isinstance(fill, (tuple, list)):
+            length = len(fill)
+            num_channels = img.shape[-3]
+            if length > 1 and length != num_channels:
+                raise ValueError(
+                    "The number of elements in 'fill' cannot broadcast to match the number of "
+                    f"channels of the image ({length} != {num_channels})"
+                )
+        elif not isinstance(fill, (int, float)):
+            raise ValueError("Argument fill should be either int, float, tuple or list")
+
+    if interpolation not in supported_interpolation_modes:
+        raise ValueError(f"Interpolation mode '{interpolation}' is unsupported with Tensor input")
+
+
 def affine_image_tensor(
     image: torch.Tensor,
     angle: Union[int, float],
@@ -1143,7 +1176,7 @@ def perspective_image_tensor(
     else:
         needs_unsquash = False
 
-    _FT._assert_grid_transform_inputs(
+    _assert_grid_transform_inputs(
         image,
         matrix=None,
         interpolation=interpolation.value,

From dca1923ac0cd96f045f2764e6697510a1d5c5ea9 Mon Sep 17 00:00:00 2001
From: Vasilis Vryniotis
Date: Fri, 11 Nov 2022 18:20:22 +0000
Subject: [PATCH 04/15] Fix bugs on `_pad_with_scalar_fill` & `crop_mask` and
 port `crop_image_tensor`

---
 .../transforms/functional/_geometry.py       | 96 ++++++++++++-------
 1 file changed, 61 insertions(+), 35 deletions(-)

diff --git a/torchvision/prototype/transforms/functional/_geometry.py b/torchvision/prototype/transforms/functional/_geometry.py
index 7de6d8dccb1..08ee30ddd88 100644
--- a/torchvision/prototype/transforms/functional/_geometry.py
+++ b/torchvision/prototype/transforms/functional/_geometry.py
@@ -889,39 +889,38 @@ def _pad_with_scalar_fill(
     shape = image.shape
     num_channels, height, width = shape[-3:]
 
-    if image.numel() > 0:
-        image = image.reshape(-1, num_channels, height, width)
-
-        if padding_mode == "edge":
-            # Similar to the padding order, `torch_pad`'s and PIL's padding modes don't have the same names. Thus, we map
-            # the PIL name for the padding mode, which we are also using for our API, to the corresponding `torch_pad`
-            # name.
-            padding_mode = "replicate"
-
-        if padding_mode == "constant":
-            image = torch_pad(image, torch_padding, mode=padding_mode, value=float(fill))
-        elif padding_mode in ("reflect", "replicate"):
-            # `torch_pad` only supports `"reflect"` or `"replicate"` padding for floating point inputs.
-            # TODO: See https://github.com/pytorch/pytorch/issues/40763
-            dtype = image.dtype
-            if not image.is_floating_point():
-                needs_cast = True
-                image = image.to(torch.float32)
-            else:
-                needs_cast = False
-
-            image = torch_pad(image, torch_padding, mode=padding_mode)
-
-            if needs_cast:
-                image = image.to(dtype)
-        else:  # padding_mode == "symmetric"
-            image = _FT._pad_symmetric(image, torch_padding)
+    batch_size = 1
+    for s in shape[:-3]:
+        batch_size *= s
+
+    image = image.reshape(batch_size, num_channels, height, width)
+
+    if padding_mode == "edge":
+        # Similar to the padding order, `torch_pad`'s and PIL's padding modes don't have the same names. Thus, we map
+        # the PIL name for the padding mode, which we are also using for our API, to the corresponding `torch_pad`
+        # name.
+        padding_mode = "replicate"
+
+    if padding_mode == "constant":
+        image = torch_pad(image, torch_padding, mode=padding_mode, value=float(fill))
+    elif padding_mode in ("reflect", "replicate"):
+        # `torch_pad` only supports `"reflect"` or `"replicate"` padding for floating point inputs.
+        # TODO: See https://github.com/pytorch/pytorch/issues/40763
+        dtype = image.dtype
+        if not image.is_floating_point():
+            needs_cast = True
+            image = image.to(torch.float32)
+        else:
+            needs_cast = False
 
-        new_height, new_width = image.shape[-2:]
-    else:
-        left, right, top, bottom = torch_padding
-        new_height = height + top + bottom
-        new_width = width + left + right
+        image = torch_pad(image, torch_padding, mode=padding_mode)
+
+        if needs_cast:
+            image = image.to(dtype)
+    else:  # padding_mode == "symmetric"
+        image = _FT._pad_symmetric(image, torch_padding)
+
+    new_height, new_width = image.shape[-2:]
 
     return image.reshape(shape[:-3] + (num_channels, new_height, new_width))
 
@@ -1030,7 +1029,23 @@ def pad(
     return pad_image_pil(inpt, padding, fill=fill, padding_mode=padding_mode)
 
 
-crop_image_tensor = _FT.crop
+def crop_image_tensor(image: torch.Tensor, top: int, left: int, height: int, width: int) -> torch.Tensor:
+    h, w = image.shape[-2:]
+
+    right = left + width
+    bottom = top + height
+
+    if left < 0 or top < 0 or right > w or bottom > h:
+        padding_ltrb = [
+            max(min(right, 0) - left, 0),
+            max(min(bottom, 0) - top, 0),
+            max(right - max(w, left), 0),
+            max(bottom - max(h, top), 0),
+        ]
+        return pad_image_tensor(image[..., max(top, 0) : bottom, max(left, 0) : right], padding_ltrb)
+    return image[..., top:bottom, left:right]
+
+
 crop_image_pil = _FP.crop
 
 
@@ -1055,7 +1070,18 @@ def crop_bounding_box(
 
 
 def crop_mask(mask: torch.Tensor, top: int, left: int, height: int, width: int) -> torch.Tensor:
-    return crop_image_tensor(mask, top, left, height, width)
+    if mask.ndim < 3:
+        mask = mask.unsqueeze(0)
+        needs_squeeze = True
+    else:
+        needs_squeeze = False
+
+    output = crop_image_tensor(mask, top, left, height, width)
+
+    if needs_squeeze:
+        output = output.squeeze(0)
+
+    return output
 
 
 def crop_video(video: torch.Tensor, top: int, left: int, height: int, width: int) -> torch.Tensor:
@@ -1152,7 +1178,7 @@ def perspective_image_tensor(
         coeffs=perspective_coeffs,
     )
 
-    ow, oh = image.shape[-1], image.shape[-2]
+    oh, ow = image.shape[-2:]
     dtype = image.dtype if fp else torch.float32
     grid = _perspective_grid(perspective_coeffs, ow=ow, oh=oh, dtype=dtype, device=image.device)
     output = _apply_grid_transform(image if fp else image.to(dtype), grid, interpolation.value, fill=fill)

From b5548ecd13b5c9fa2788f52e3d68fa6e5d1a9e8b Mon Sep 17 00:00:00 2001
From: Vasilis Vryniotis
Date: Fri, 11 Nov 2022 18:39:52 +0000
Subject: [PATCH 05/15] Call directly `_pad_with_scalar_fill`

---
 torchvision/prototype/transforms/functional/_geometry.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/torchvision/prototype/transforms/functional/_geometry.py b/torchvision/prototype/transforms/functional/_geometry.py
index 08ee30ddd88..89c243ce798 100644
--- a/torchvision/prototype/transforms/functional/_geometry.py
+++ b/torchvision/prototype/transforms/functional/_geometry.py
@@ -1036,13 +1036,14 @@ def crop_image_tensor(image: torch.Tensor, top: int, left: int, height: int, wid
     bottom = top + height
 
     if left < 0 or top < 0 or right > w or bottom > h:
-        padding_ltrb = [
+        image = image[..., max(top, 0) : bottom, max(left, 0) : right]
+        torch_padding = [
             max(min(right, 0) - left, 0),
-            max(min(bottom, 0) - top, 0),
             max(right - max(w, left), 0),
+            max(min(bottom, 0) - top, 0),
             max(bottom - max(h, top), 0),
         ]
-        return pad_image_tensor(image[..., max(top, 0) : bottom, max(left, 0) : right], padding_ltrb)
+        return _pad_with_scalar_fill(image, torch_padding, fill=0, padding_mode="constant")
     return image[..., top:bottom, left:right]

From 709b34a49ed607e8b84c5667ec4d9cc92a2fdacc Mon Sep 17 00:00:00 2001
From: Vasilis Vryniotis
Date: Fri, 11 Nov 2022 18:50:33 +0000
Subject: [PATCH 06/15] Fix linter

---
 torchvision/prototype/transforms/functional/_geometry.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/torchvision/prototype/transforms/functional/_geometry.py b/torchvision/prototype/transforms/functional/_geometry.py
index 89c243ce798..d92a8f6869c 100644
--- a/torchvision/prototype/transforms/functional/_geometry.py
+++ b/torchvision/prototype/transforms/functional/_geometry.py
@@ -5,7 +5,7 @@
 import PIL.Image
 import torch
-from torch.nn.functional import interpolate, grid_sample, pad as torch_pad
+from torch.nn.functional import grid_sample, interpolate, pad as torch_pad
 
 from torchvision.prototype import features
 from torchvision.transforms import functional_pil as _FP, functional_tensor as _FT
@@ -369,8 +369,8 @@ def _compute_affine_output_size(matrix: List[float], w: int, h: int) -> Tuple[in
 
 
 def _apply_grid_transform(
-    float_img: torch.Tensor, grid: torch.Tensor, mode: str, fill: Optional[Union[int, float, List[float]]]
-) -> torch.Tensor:
+    float_img: torch.Tensor, grid: torch.Tensor, mode: str, fill: Optional[Union[int, float, List[float]]]
+) -> torch.Tensor:
 
     shape = float_img.shape
     if shape[0] > 1:

From b9a6e74dd9bb68aa5059a88055f124c68c51b040 Mon Sep 17 00:00:00 2001
From: Vasilis Vryniotis
Date: Fri, 11 Nov 2022 18:56:56 +0000
Subject: [PATCH 07/15] Clean up `center_crop_image_tensor`

---
 torchvision/prototype/transforms/functional/_geometry.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torchvision/prototype/transforms/functional/_geometry.py b/torchvision/prototype/transforms/functional/_geometry.py
index d92a8f6869c..b44a926c3ae 100644
--- a/torchvision/prototype/transforms/functional/_geometry.py
+++ b/torchvision/prototype/transforms/functional/_geometry.py
@@ -1538,7 +1538,7 @@ def center_crop_image_tensor(image: torch.Tensor, output_size: List[int]) -> tor
 
     if crop_height > image_height or crop_width > image_width:
         padding_ltrb = _center_crop_compute_padding(crop_height, crop_width, image_height, image_width)
-        image = _FT.torch_pad(image, _FT._parse_pad_padding(padding_ltrb), value=0.0)
+        image = torch_pad(image, _parse_pad_padding(padding_ltrb), value=0.0)
 
         image_height, image_width = image.shape[-2:]
     if crop_width == image_width and crop_height == image_height:

From 62b9d47be6582114101617179ee43bd8a1fa4647 Mon Sep 17 00:00:00 2001
From: Vasilis Vryniotis
Date: Mon, 14 Nov 2022 10:36:46 +0000
Subject: [PATCH 08/15] Fix comments.
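
The comment above `_get_inverse_affine_matrix` wrote the (1, 0) entry as
s*sin(a + sy)/cos(sy), while the code computes sin(a - sy)/cos(sy), which is
the correct expansion. A quick numerical check of the corrected matrix against
the explicit product R(a) * S(s) * SHy(sy) * SHx(sx) (illustrative only, not
part of the change; the sample angles and scale below are arbitrary):

    import math

    import torch

    rot, sx, sy = (math.radians(v) for v in (33.0, 10.0, 20.0))
    s = 1.25

    # The four factors named in the comment
    R = torch.tensor([[math.cos(rot), -math.sin(rot)], [math.sin(rot), math.cos(rot)]])
    S = s * torch.eye(2)
    SHx = torch.tensor([[1.0, -math.tan(sx)], [0.0, 1.0]])
    SHy = torch.tensor([[1.0, 0.0], [-math.tan(sy), 1.0]])

    # Closed form from the (corrected) comment
    expected = s * torch.tensor(
        [
            [math.cos(rot - sy) / math.cos(sy), -math.cos(rot - sy) * math.tan(sx) / math.cos(sy) - math.sin(rot)],
            [math.sin(rot - sy) / math.cos(sy), -math.sin(rot - sy) * math.tan(sx) / math.cos(sy) + math.cos(rot)],
        ]
    )
    torch.testing.assert_close(R @ S @ SHy @ SHx, expected)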
---
 torchvision/prototype/transforms/functional/_geometry.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/torchvision/prototype/transforms/functional/_geometry.py b/torchvision/prototype/transforms/functional/_geometry.py
index b44a926c3ae..b02ecb8b16b 100644
--- a/torchvision/prototype/transforms/functional/_geometry.py
+++ b/torchvision/prototype/transforms/functional/_geometry.py
@@ -287,7 +287,7 @@ def _get_inverse_affine_matrix(
     # RotateScaleShear(a, s, (sx, sy)) =
     #   = R(a) * S(s) * SHy(sy) * SHx(sx)
     #   = [ s*cos(a - sy)/cos(sy), s*(-cos(a - sy)*tan(sx)/cos(sy) - sin(a)), 0 ]
-    #     [ s*sin(a + sy)/cos(sy), s*(-sin(a - sy)*tan(sx)/cos(sy) + cos(a)), 0 ]
+    #     [ s*sin(a - sy)/cos(sy), s*(-sin(a - sy)*tan(sx)/cos(sy) + cos(a)), 0 ]
     #     [ 0                    , 0                                        , 1 ]
     # where R is a rotation matrix, S is a scaling matrix, and SHx and SHy are the shears:
     # SHx(s) = [1, -tan(s)] and SHy(s) = [1      , 0]

From b3a0bb180f9febc5a7db85c2b4265791c2515060 Mon Sep 17 00:00:00 2001
From: Vasilis Vryniotis
Date: Mon, 14 Nov 2022 12:19:41 +0000
Subject: [PATCH 09/15] Fixing rounding issues.

---
 torchvision/prototype/transforms/functional/_geometry.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/torchvision/prototype/transforms/functional/_geometry.py b/torchvision/prototype/transforms/functional/_geometry.py
index b02ecb8b16b..c57d92f0b58 100644
--- a/torchvision/prototype/transforms/functional/_geometry.py
+++ b/torchvision/prototype/transforms/functional/_geometry.py
@@ -311,7 +311,7 @@ def _get_inverse_affine_matrix(
 
     # RSS without scaling
     a = math.cos(rot_sy) / cossy
-    b = -math.sin(rot) - a * tansx
+    b = -(a * tansx + math.sin(rot))
     c = math.sin(rot_sy) / cossy
     d = math.cos(rot) - c * tansx
 
@@ -1179,13 +1179,13 @@ def perspective_image_tensor(
         coeffs=perspective_coeffs,
     )
 
-    oh, ow = image.shape[-2:]
+    oh, ow = shape[-2:]
     dtype = image.dtype if fp else torch.float32
     grid = _perspective_grid(perspective_coeffs, ow=ow, oh=oh, dtype=dtype, device=image.device)
     output = _apply_grid_transform(image if fp else image.to(dtype), grid, interpolation.value, fill=fill)
 
     if not fp:
-        output = output.to(image.dtype)
+        output = output.round_().to(image.dtype)
 
     if needs_unsquash:
         output = output.reshape(shape)
@@ -1380,7 +1380,7 @@ def elastic_image_tensor(
     output = _apply_grid_transform(image if fp else image.to(torch.float32), grid, interpolation.value, fill=fill)
 
     if not fp:
-        output = output.to(image.dtype)
+        output = output.round_().to(image.dtype)
 
     if needs_unsquash:
         output = output.reshape(shape)

From 8e110f6912cb5468dfae546f47cb9940a61b3038 Mon Sep 17 00:00:00 2001
From: Vasilis Vryniotis
Date: Mon, 14 Nov 2022 12:49:39 +0000
Subject: [PATCH 10/15] Bumping tolerance for rotate which is unrelated to this
 PR.
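
The rotate kernel's PIL-reference check aggregates absolute pixel differences
with `agg_method="mean"`, and the observed mean now sits slightly above the
old bound of 100; the failure predates this PR. uint8 comparisons of this kind
are brittle around rounding boundaries: a sketch of how little it takes to
move a mean-aggregated metric (illustrative only; the random tensor just
stands in for a kernel output):

    import torch

    torch.manual_seed(0)
    float_out = torch.rand(3, 64, 64) * 255.0  # stand-in for a bilinear rotate result

    truncated = float_out.to(torch.uint8)        # a bare cast truncates
    rounded = float_out.round().to(torch.uint8)  # rounding first shifts ~half the pixels by one level

    diff = (rounded.int() - truncated.int()).abs()
    print(diff.max().item(), diff.float().mean().item())  # prints 1 and roughly 0.5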
---
 test/prototype_transforms_kernel_infos.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/prototype_transforms_kernel_infos.py b/test/prototype_transforms_kernel_infos.py
index 361a921b18e..25daf3da59f 100644
--- a/test/prototype_transforms_kernel_infos.py
+++ b/test/prototype_transforms_kernel_infos.py
@@ -915,7 +915,7 @@ def sample_inputs_rotate_video():
         reference_inputs_fn=reference_inputs_rotate_image_tensor,
         float32_vs_uint8=True,
         # TODO: investigate
-        closeness_kwargs=pil_reference_pixel_difference(100, agg_method="mean"),
+        closeness_kwargs=pil_reference_pixel_difference(110, agg_method="mean"),
         test_marks=[
             xfail_jit_tuple_instead_of_list("fill"),
             # TODO: check if this is a regression since it seems that should be supported if `int` is ok

From 555df2d97b5ebb0142e8503698874ab261274862 Mon Sep 17 00:00:00 2001
From: Vasilis Vryniotis
Date: Mon, 14 Nov 2022 12:54:59 +0000
Subject: [PATCH 11/15] Fix tolerance threshold for RandomPerspective.

---
 test/test_prototype_transforms_consistency.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test/test_prototype_transforms_consistency.py b/test/test_prototype_transforms_consistency.py
index 0cc52f8b838..61ebda0f4bb 100644
--- a/test/test_prototype_transforms_consistency.py
+++ b/test/test_prototype_transforms_consistency.py
@@ -401,6 +401,7 @@ def __init__(
             ArgsKwargs(p=1, distortion_scale=0.1, fill=1),
             ArgsKwargs(p=1, distortion_scale=0.4, fill=(1, 2, 3)),
         ],
+        closeness_kwargs={"atol": 1e-6, "rtol": 1e-6},
     ),
     ConsistencyConfig(
         prototype_transforms.RandomRotation,

From a32be72366ea072a34fff801afb9778951979b84 Mon Sep 17 00:00:00 2001
From: Vasilis Vryniotis
Date: Mon, 14 Nov 2022 13:57:44 +0000
Subject: [PATCH 12/15] Clean up `_affine_grid` and `affine_image_tensor`

---
 .../transforms/functional/_geometry.py       | 61 +++++++++++++++++--
 1 file changed, 55 insertions(+), 6 deletions(-)

diff --git a/torchvision/prototype/transforms/functional/_geometry.py b/torchvision/prototype/transforms/functional/_geometry.py
index c57d92f0b58..df8dcb0bde0 100644
--- a/torchvision/prototype/transforms/functional/_geometry.py
+++ b/torchvision/prototype/transforms/functional/_geometry.py
@@ -369,7 +369,7 @@ def _compute_affine_output_size(matrix: List[float], w: int, h: int) -> Tuple[in
 
 
 def _apply_grid_transform(
-    float_img: torch.Tensor, grid: torch.Tensor, mode: str, fill: Optional[Union[int, float, List[float]]]
+    float_img: torch.Tensor, grid: torch.Tensor, mode: str, fill: features.FillTypeJIT
 ) -> torch.Tensor:
 
     shape = float_img.shape
@@ -405,7 +405,7 @@ def _assert_grid_transform_inputs(
     img: torch.Tensor,
     matrix: Optional[List[float]],
     interpolation: str,
-    fill: Optional[Union[int, float, List[float]]],
+    fill: features.FillTypeJIT,
     supported_interpolation_modes: List[str],
     coeffs: Optional[List[float]] = None,
 ) -> None:
@@ -434,6 +434,34 @@ def _assert_grid_transform_inputs(
         raise ValueError(f"Interpolation mode '{interpolation}' is unsupported with Tensor input")
 
 
+def _affine_grid(
+    theta: torch.Tensor,
+    w: int,
+    h: int,
+    ow: int,
+    oh: int,
+) -> torch.Tensor:
+    # https://github.com/pytorch/pytorch/blob/74b65c32be68b15dc7c9e8bb62459efbfbde33d8/aten/src/ATen/native/
+    # AffineGridGenerator.cpp#L18
+    # Difference with AffineGridGenerator is that:
+    # 1) we normalize grid values after applying theta
+    # 2) we can normalize by another image size, so that it covers the "extend" option like in PIL.Image.rotate
+    dtype = theta.dtype
+    device = theta.device
+
+    d = 0.5
+    base_grid = torch.empty(1, oh, ow, 3, dtype=dtype, device=device)
+    x_grid = torch.linspace(-ow * 0.5 + d, ow * 0.5 + d - 1, steps=ow, device=device)
+    base_grid[..., 0].copy_(x_grid)
+    y_grid = torch.linspace(-oh * 0.5 + d, oh * 0.5 + d - 1, steps=oh, device=device).unsqueeze_(-1)
+    base_grid[..., 1].copy_(y_grid)
+    base_grid[..., 2].fill_(1)
+
+    rescaled_theta = theta.transpose(1, 2).div_(torch.tensor([0.5 * w, 0.5 * h], dtype=dtype, device=device))
+    output_grid = base_grid.view(1, oh * ow, 3).bmm(rescaled_theta)
+    return output_grid.view(1, oh, ow, 2)
+
+
 def affine_image_tensor(
     image: torch.Tensor,
     angle: Union[int, float],
@@ -448,9 +476,19 @@ def affine_image_tensor(
         return image
 
     shape = image.shape
-    num_channels, height, width = shape[-3:]
-    image = image.reshape(-1, num_channels, height, width)
+    ndim = image.ndim
+    fp = torch.is_floating_point(image)
+
+    if ndim > 4:
+        image = image.reshape((-1,) + shape[-3:])
+        needs_unsquash = True
+    elif ndim == 3:
+        image = image.unsqueeze(0)
+        needs_unsquash = True
+    else:
+        needs_unsquash = False
 
+    height, width = shape[-2:]
     angle, translate, shear, center = _affine_parse_args(angle, translate, scale, shear, interpolation, center)
 
     center_f = [0.0, 0.0]
@@ -461,9 +499,20 @@ def affine_image_tensor(
         translate_f = [float(t) for t in translate]
 
     matrix = _get_inverse_affine_matrix(center_f, angle, translate_f, scale, shear)
-    output = _FT.affine(image, matrix, interpolation=interpolation.value, fill=fill)
-    return output.reshape(shape)
+    _assert_grid_transform_inputs(image, matrix, interpolation.value, fill, ["nearest", "bilinear"])
+
+    dtype = image.dtype if fp else torch.float32
+    theta = torch.tensor(matrix, dtype=dtype, device=image.device).reshape(1, 2, 3)
+    grid = _affine_grid(theta, w=width, h=height, ow=width, oh=height)
+    output = _apply_grid_transform(image if fp else image.to(dtype), grid, interpolation.value, fill=fill)
+
+    if not fp:
+        output = output.round_().to(image.dtype)
+
+    if needs_unsquash:
+        output = output.reshape(shape)
 
+    return output
 
 @torch.jit.unused
 def affine_image_pil(

From 311ff85ae693cdca795759d9b4325777a852d947 Mon Sep 17 00:00:00 2001
From: Vasilis Vryniotis
Date: Mon, 14 Nov 2022 14:25:34 +0000
Subject: [PATCH 13/15] Clean up `rotate_image_tensor`

---
 .../transforms/functional/_geometry.py       | 30 ++++++++++++-------
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/torchvision/prototype/transforms/functional/_geometry.py b/torchvision/prototype/transforms/functional/_geometry.py
index df8dcb0bde0..cf85a1e9d7c 100644
--- a/torchvision/prototype/transforms/functional/_geometry.py
+++ b/torchvision/prototype/transforms/functional/_geometry.py
@@ -402,7 +402,7 @@ def _apply_grid_transform(
 
 
 def _assert_grid_transform_inputs(
-    img: torch.Tensor,
+    image: torch.Tensor,
     matrix: Optional[List[float]],
     interpolation: str,
     fill: features.FillTypeJIT,
@@ -421,7 +421,7 @@ def _assert_grid_transform_inputs(
     if fill is not None:
         if isinstance(fill, (tuple, list)):
             length = len(fill)
-            num_channels = img.shape[-3]
+            num_channels = image.shape[-3]
             if length > 1 and length != num_channels:
                 raise ValueError(
                     "The number of elements in 'fill' cannot broadcast to match the number of "
@@ -754,18 +754,26 @@ def rotate_image_tensor(
         matrix = _get_inverse_affine_matrix(center_f, -angle, [0.0, 0.0], 1.0, [0.0, 0.0])
 
     if image.numel() > 0:
-        image = _FT.rotate(
-            image.reshape(-1, num_channels, height, width),
-            matrix,
-            interpolation=interpolation.value,
-            expand=expand,
-            fill=fill,
-        )
-        new_height, new_width = image.shape[-2:]
+        fp = torch.is_floating_point(image)
+        image = image.reshape(-1, num_channels, height, width)
+
+        _assert_grid_transform_inputs(image, matrix, interpolation.value, fill, ["nearest", "bilinear"])
+
+        ow, oh = _compute_affine_output_size(matrix, width, height) if expand else (width, height)
+        dtype = image.dtype if fp else torch.float32
+        theta = torch.tensor(matrix, dtype=dtype, device=image.device).reshape(1, 2, 3)
+        grid = _affine_grid(theta, w=width, h=height, ow=ow, oh=oh)
+        output = _apply_grid_transform(image if fp else image.to(dtype), grid, interpolation.value, fill=fill)
+
+        if not fp:
+            output = output.round_().to(image.dtype)
+
+        new_height, new_width = output.shape[-2:]
     else:
+        output = image
         new_width, new_height = _compute_affine_output_size(matrix, width, height) if expand else (width, height)
 
-    return image.reshape(shape[:-3] + (num_channels, new_height, new_width))
+    return output.reshape(shape[:-3] + (num_channels, new_height, new_width))
 
 
 @torch.jit.unused

From 6644006733b9825409809a8702fb1be907f3e60b Mon Sep 17 00:00:00 2001
From: Vasilis Vryniotis
Date: Mon, 14 Nov 2022 14:47:56 +0000
Subject: [PATCH 14/15] Fixing linter

---
 torchvision/prototype/transforms/functional/_geometry.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/torchvision/prototype/transforms/functional/_geometry.py b/torchvision/prototype/transforms/functional/_geometry.py
index cf85a1e9d7c..a980a43dddc 100644
--- a/torchvision/prototype/transforms/functional/_geometry.py
+++ b/torchvision/prototype/transforms/functional/_geometry.py
@@ -514,6 +514,7 @@ def affine_image_tensor(
 
     return output
 
+
 @torch.jit.unused
 def affine_image_pil(
     image: PIL.Image.Image,

From d3639e06de883b632a4aeea5a9978ec929a25f43 Mon Sep 17 00:00:00 2001
From: Vasilis Vryniotis
Date: Mon, 14 Nov 2022 15:35:06 +0000
Subject: [PATCH 15/15] Address code-review comments.
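
Three review points: the cached variables in `_get_inverse_affine_matrix` get
clearer names (`cos_sy`, `tan_sx`, `rot_minus_sy`), the RandomPerspective
consistency test drops the hand-picked tolerances in favour of the dtype-based
defaults, and `_affine_grid` folds the `d = 0.5` offset into the `linspace`
endpoints. The last change is purely algebraic, since
-ow * 0.5 + d == (1.0 - ow) * 0.5 and ow * 0.5 + d - 1 == (ow - 1.0) * 0.5
for d = 0.5. A quick check (illustrative only, not part of the change):

    import torch

    d = 0.5
    for ow in (1, 2, 7, 224):
        old = torch.linspace(-ow * 0.5 + d, ow * 0.5 + d - 1, steps=ow)
        new = torch.linspace((1.0 - ow) * 0.5, (ow - 1.0) * 0.5, steps=ow)
        torch.testing.assert_close(old, new)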
---
 test/test_prototype_transforms_consistency.py |  2 +-
 .../transforms/functional/_geometry.py        | 21 +++++++++----------
 2 files changed, 11 insertions(+), 12 deletions(-)

diff --git a/test/test_prototype_transforms_consistency.py b/test/test_prototype_transforms_consistency.py
index 61ebda0f4bb..d82d9ebea4f 100644
--- a/test/test_prototype_transforms_consistency.py
+++ b/test/test_prototype_transforms_consistency.py
@@ -401,7 +401,7 @@ def __init__(
             ArgsKwargs(p=1, distortion_scale=0.1, fill=1),
             ArgsKwargs(p=1, distortion_scale=0.4, fill=(1, 2, 3)),
         ],
-        closeness_kwargs={"atol": 1e-6, "rtol": 1e-6},
+        closeness_kwargs={"atol": None, "rtol": None},
     ),
     ConsistencyConfig(
         prototype_transforms.RandomRotation,
diff --git a/torchvision/prototype/transforms/functional/_geometry.py b/torchvision/prototype/transforms/functional/_geometry.py
index a980a43dddc..41262185b5d 100644
--- a/torchvision/prototype/transforms/functional/_geometry.py
+++ b/torchvision/prototype/transforms/functional/_geometry.py
@@ -303,17 +303,17 @@ def _get_inverse_affine_matrix(
     tx, ty = translate
 
     # Cached results
-    cossy = math.cos(sy)
-    tansx = math.tan(sx)
-    rot_sy = rot - sy
+    cos_sy = math.cos(sy)
+    tan_sx = math.tan(sx)
+    rot_minus_sy = rot - sy
     cx_plus_tx = cx + tx
     cy_plus_ty = cy + ty
 
-    # RSS without scaling
-    a = math.cos(rot_sy) / cossy
-    b = -(a * tansx + math.sin(rot))
-    c = math.sin(rot_sy) / cossy
-    d = math.cos(rot) - c * tansx
+    # Rotate Scale Shear (RSS) without scaling
+    a = math.cos(rot_minus_sy) / cos_sy
+    b = -(a * tan_sx + math.sin(rot))
+    c = math.sin(rot_minus_sy) / cos_sy
+    d = math.cos(rot) - c * tan_sx
 
     if inverted:
@@ -449,11 +449,10 @@ def _affine_grid(
     dtype = theta.dtype
     device = theta.device
 
-    d = 0.5
     base_grid = torch.empty(1, oh, ow, 3, dtype=dtype, device=device)
-    x_grid = torch.linspace(-ow * 0.5 + d, ow * 0.5 + d - 1, steps=ow, device=device)
+    x_grid = torch.linspace((1.0 - ow) * 0.5, (ow - 1.0) * 0.5, steps=ow, device=device)
     base_grid[..., 0].copy_(x_grid)
-    y_grid = torch.linspace(-oh * 0.5 + d, oh * 0.5 + d - 1, steps=oh, device=device).unsqueeze_(-1)
+    y_grid = torch.linspace((1.0 - oh) * 0.5, (oh - 1.0) * 0.5, steps=oh, device=device).unsqueeze_(-1)
     base_grid[..., 1].copy_(y_grid)
     base_grid[..., 2].fill_(1)
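
Appendix (editor's sketch, not part of the series): the single-pass fill
technique that `_apply_grid_transform` introduces in PATCH 02 and that the
affine, rotate, perspective and elastic kernels all reuse — append a ones
"mask" channel, warp everything with one `grid_sample` call, then blend the
fill color wherever the mask decayed. A minimal self-contained version; the
zoomed-out grid below is arbitrary, any affine or perspective grid works the
same way:

    import torch
    from torch.nn.functional import grid_sample

    img = torch.rand(1, 3, 8, 8)  # any float image batch
    ys, xs = torch.meshgrid(torch.linspace(-1.5, 1.5, 8), torch.linspace(-1.5, 1.5, 8), indexing="ij")
    grid = torch.stack((xs, ys), dim=-1).unsqueeze(0)  # samples outside [-1, 1], i.e. outside the image

    fill = torch.tensor([1.0, 0.5, 0.0]).view(1, -1, 1, 1)  # arbitrary per-channel fill color
    mask = torch.ones_like(img[:, :1])  # dummy mask channel

    # One grid_sample call warps the image and the mask together
    warped = grid_sample(torch.cat((img, mask), dim=1), grid, mode="bilinear", padding_mode="zeros", align_corners=False)
    out, mask = warped[:, :-1], warped[:, -1:]

    # mask stays 1 inside the source image and fades to 0 outside; blend as in the patch:
    # out * mask + (1 - mask) * fill == (out - fill) * mask + fill
    out = out.sub(fill).mul(mask).add(fill)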