
[NOMRG] TransformsV2 questions / comments #7092


Closed
wants to merge 15 commits
25 changes: 25 additions & 0 deletions torchvision/prototype/datapoints/_bounding_box.py
@@ -15,6 +15,31 @@ class BoundingBoxFormat(StrEnum):
CXCYWH = StrEnum.auto()


# What if... we just removed the format and spatial_size meta-data?
# A: We could, but it comes with trade-offs. For the format, this wouldn't lead
# to much of a difference, except that users would have to convert to XYXY
# before doing anything. All of the current stable ops expect XYXY already, so
# it's not much of a change. Worth noting as well that a few BBox transforms
# only have an implementation for the XYXY format, and they convert / re-convert
# internally (see e.g. affine_bounding_box, but there are others).
# Removing spatial_size, however, would make the dispatcher level more clunky
# for users. It wouldn't change much for the transforms classes as long as
# they're called with their respective image, e.g.
#     T(image, bbox)
# because the spatial_size can be known from the image param. But in a mid-level
# dispatcher which only accepts one kind of input, like
#     dispatcher(bbox)
# there's no way to know the spatial_size unless it's passed as a parameter.
# Users would also need to keep track of it, since some transforms actually
# change it:
#     bbox, sz = resize(bbox, spatial_size=sz)
# This also means the mid-level dispatchers:
# - need to accept as input anything that was meta-data (in this case
#   spatial_size)
# - need to return it as well, which means they need to return either a single
#   image, a single video, or a tuple of (bbox, spatial_size).
# TL;DR: things would get messy for users and for us.

class BoundingBox(Datapoint):
format: BoundingBoxFormat # TODO: do not use a builtin?
# TODO: This is the size of the image, not the box. Maybe make this explicit in the name?
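To make the clunkiness concrete, here is what a mid-level dispatcher could look like without the spatial_size metadata. This is a hypothetical sketch, not the torchvision API: resize_bounding_box and its signature are illustrative only. The caller has to thread spatial_size in and out manually.

```python
import torch

# Hypothetical sketch, NOT the torchvision API: a mid-level resize dispatcher
# for bare XYXY boxes, as it would look if BoundingBox carried no spatial_size.
def resize_bounding_box(bbox: torch.Tensor, spatial_size, size):
    old_h, old_w = spatial_size
    new_h, new_w = size
    # Per-coordinate scaling factors for XYXY boxes: (x1, y1, x2, y2).
    ratios = torch.tensor([new_w / old_w, new_h / old_h] * 2)
    # The caller must capture the returned spatial_size to stay consistent.
    return bbox * ratios, (new_h, new_w)

bbox = torch.tensor([[10.0, 20.0, 30.0, 40.0]])
bbox, sz = resize_bounding_box(bbox, spatial_size=(100, 100), size=(200, 200))
# bbox is now [[20., 40., 60., 80.]] and sz is (200, 200)
```

With spatial_size stored on the BoundingBox datapoint instead, a dispatcher like resize(bbox) can read and update the metadata internally and return just the box.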
2 changes: 1 addition & 1 deletion torchvision/prototype/transforms/_augment.py
@@ -94,7 +94,7 @@ def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]:
def _transform(
self, inpt: Union[datapoints.ImageType, datapoints.VideoType], params: Dict[str, Any]
) -> Union[datapoints.ImageType, datapoints.VideoType]:
if params["v"] is not None:
if params["v"] is not None: # What is this?
Collaborator:
It's a value tensor or None used to erase the image

Collaborator:

Basically it is the replacement that is put in the "erased" area. In v1, in case we didn't find an area to erase, we return the bounding box of the whole image as well as the image

return 0, 0, img_h, img_w, img

With that we call F.erase unconditionally, which ultimately leads to replacing every value in the original image with itself:

if not inplace:
img = img.clone()
img[..., i : i + h, j : j + w] = v
return img

Since that is quite nonsensical, we opted to also allow None as a return value and use it as a sentinel to do nothing. I think the previous implementation came from a time when JIT didn't support Union (or Optional, for that matter) and thus we couldn't return Optional[torch.Tensor].

inpt = F.erase(inpt, **params, inplace=self.inplace)

return inpt
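The sentinel pattern described in the comments above can be sketched as follows. This is a simplified stand-in, not the actual RandomErasing implementation: get_erase_params and apply_erase are hypothetical names, and the parameter search is stubbed out to always "fail".

```python
import torch

def get_erase_params(img: torch.Tensor) -> dict:
    # The real transform samples a random area to erase; here we pretend the
    # search failed, so v=None is the sentinel meaning "do nothing".
    return {"i": 0, "j": 0, "h": 0, "w": 0, "v": None}

def apply_erase(img: torch.Tensor, params: dict) -> torch.Tensor:
    if params["v"] is not None:  # the check the inline question is about
        img = img.clone()
        i, j, h, w = params["i"], params["j"], params["h"], params["w"]
        img[..., i : i + h, j : j + w] = params["v"]
    return img  # returned untouched when v is None

img = torch.ones(3, 8, 8)
out = apply_erase(img, get_erase_params(img))
assert out is img  # no clone, no write: the no-op branch was taken
```

Compared to the v1 behavior quoted above, the no-op case skips the clone and the self-assignment entirely instead of erasing the whole image with itself.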
5 changes: 5 additions & 0 deletions torchvision/prototype/transforms/functional/_meta.py
@@ -190,6 +190,7 @@ def _xyxy_to_cxcywh(xyxy: torch.Tensor, inplace: bool) -> torch.Tensor:
return xyxy


# TODO: Maybe make this available as a class transform as well?
def convert_format_bounding_box(
bounding_box: torch.Tensor, old_format: BoundingBoxFormat, new_format: BoundingBoxFormat, inplace: bool = False
) -> torch.Tensor:
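For reference, the XYXY to CXCYWH case that convert_format_bounding_box dispatches to boils down to the arithmetic below. This is a simplified sketch of the private _xyxy_to_cxcywh helper shown in the hunk above, omitting its inplace path.

```python
import torch

def xyxy_to_cxcywh(xyxy: torch.Tensor) -> torch.Tensor:
    # XYXY stores the two corners; CXCYWH stores center, width, and height.
    x1, y1, x2, y2 = xyxy.unbind(-1)
    cx = (x1 + x2) / 2
    cy = (y1 + y2) / 2
    w = x2 - x1
    h = y2 - y1
    return torch.stack([cx, cy, w, h], dim=-1)

box = torch.tensor([[0.0, 0.0, 10.0, 20.0]])
# center (5, 10), width 10, height 20
assert torch.equal(xyxy_to_cxcywh(box), torch.tensor([[5.0, 10.0, 10.0, 20.0]]))
```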
@@ -437,6 +438,10 @@ def convert_dtype_video(video: torch.Tensor, dtype: torch.dtype = torch.float) -
return convert_dtype_image_tensor(video, dtype)


# TODO: this doesn't just change the dtype, it also changes the value range.
# This name relies on the implicit assumption that the value range is determined
# by the dtype. Maybe think of a more descriptive name if we can (once and for
# all)
def convert_dtype(
inpt: Union[datapoints.ImageTypeJIT, datapoints.VideoTypeJIT], dtype: torch.dtype = torch.float
) -> torch.Tensor:
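A quick illustration of the range change the TODO is pointing at, under the implicit assumption it mentions: uint8 images span [0, 255] while float images span [0.0, 1.0]. The function below (convert_dtype_sketch) is a hypothetical, simplified stand-in that only covers the uint8-to-float direction; the real convert_dtype handles many more dtype pairs.

```python
import torch

def convert_dtype_sketch(image: torch.Tensor, dtype: torch.dtype = torch.float) -> torch.Tensor:
    # Converting the dtype also rescales the values: 255 (uint8 max) maps
    # to 1.0, so the value range changes along with the dtype.
    if image.dtype == torch.uint8 and dtype.is_floating_point:
        return image.to(dtype) / 255.0
    raise NotImplementedError("sketch only covers uint8 -> float")

img = torch.tensor([0, 128, 255], dtype=torch.uint8)
out = convert_dtype_sketch(img)
# values are now 0.0, ~0.502, 1.0: the range changed, not just the dtype
```

That coupling is why a plain `.to(dtype)` is not equivalent, and why a name mentioning the value-range normalization might be more descriptive.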