
[CLIP][IMAGE TRANSFORMS] Image transforms for clip encoder #1084

Merged · 28 commits · Jul 5, 2024
14 changes: 14 additions & 0 deletions docs/source/api_ref_modules.rst
@@ -79,3 +79,17 @@ Loss
:nosignatures:

loss.DPOLoss


Vision Transforms
------------------
Functions used for preprocessing images.

.. autosummary::
:toctree: generated/
:nosignatures:

transforms.get_canvas_best_fit
transforms.resize_with_pad
transforms.tile_crop
transforms.find_supported_resolutions
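
A minimal sketch of how these helpers might compose into one preprocessing pipeline, assuming the signatures exercised by the tests in this PR (`tile_crop`'s argument order is not shown in the diff and is an assumption):

```python
import torch
import torchvision
from torchtune.modules.transforms import (
    find_supported_resolutions,
    get_canvas_best_fit,
    resize_with_pad,
    tile_crop,
)

image = torch.rand(3, 800, 600)  # CHW image with values in [0, 1]

# Every canvas reachable with at most 4 tiles of 224x224.
possible_resolutions = torch.tensor(find_supported_resolutions(4, 224))

# Best-fitting canvas for this image; the third positional argument is
# resize_to_max_canvas, passed positionally as in the tests below.
# (Assumes the helper reads only the trailing H, W dims of the tensor.)
best_resolution = get_canvas_best_fit(image, possible_resolutions, False)
target_size = tuple(int(x) for x in best_resolution)

# Resize into the canvas, zero-padding the remainder.
padded = resize_with_pad(
    image=image,
    target_size=target_size,
    resample=torchvision.transforms.InterpolationMode.BILINEAR,
    max_upscaling_size=None,
)

# Split the canvas into fixed-size tiles.
tiles = tile_crop(padded, 224)  # expected shape: (n_tiles, 3, 224, 224)
```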
3 changes: 3 additions & 0 deletions pyproject.toml
@@ -10,6 +10,9 @@ authors = [
]
keywords = ["pytorch", "finetuning", "llm"]
dependencies = [
# multimodality
"torchvision",

# Hugging Face integrations
"datasets",
"huggingface_hub",
5 changes: 5 additions & 0 deletions tests/torchtune/models/clip/__init__.py
@@ -0,0 +1,5 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
75 changes: 75 additions & 0 deletions tests/torchtune/models/clip/test_clip_image_transform.py
@@ -0,0 +1,75 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import numpy as np
import PIL
import pytest

import torch

from torchtune.models.clip._transforms import CLIPImageTransform


class TestPipelines:
@pytest.mark.parametrize(
"params",
[
{
"image_size": (100, 100, 3),
"expected_shape": torch.Size([1, 3, 224, 224]),
"resize_to_max_canvas": False,
},
{
"image_size": (200, 300),
"expected_shape": torch.Size([4, 3, 224, 224]),
"resize_to_max_canvas": True,
},
{
"image_size": (100, 200, 3),
"expected_shape": torch.Size([2, 3, 224, 224]),
"resize_to_max_canvas": True,
},
{
"image_size": (100, 200),
"expected_shape": torch.Size([1, 3, 224, 224]),
"resize_to_max_canvas": False,
},
],
)
def test_clip_image_transform(self, params):

image_transform = CLIPImageTransform(
image_mean=None,
image_std=None,
tile_size=224,
possible_resolutions=None,
max_num_tiles=4,
resample="bilinear",
resize_to_max_canvas=params["resize_to_max_canvas"],
)

image_size = params["image_size"]

# Create a random image
image = (np.random.rand(*image_size) * 255).astype(np.uint8) # type: ignore
image = PIL.Image.fromarray(image) # type: ignore

output = image_transform(image=image)
output_image = output["image"]
output_ar = output["aspect_ratio"]

assert (
output_image.shape == params["expected_shape"]
), f"Expected shape {params['expected_shape']} but got {output_image.shape}"

assert (
0 <= output_image.min() <= output_image.max() <= 1
), f"Expected pixel values to be in range [0, 1] but got {output_image.min()} and {output_image.max()}"

expected_num_tiles = output_ar[0] * output_ar[1]
assert (
expected_num_tiles == output_image.shape[0]
), f"Expected {expected_num_tiles} tiles but got {output_image.shape[0]}"
160 changes: 160 additions & 0 deletions tests/torchtune/modules/transforms/test_get_canvas_best_fit.py
@@ -0,0 +1,160 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import pytest
import torch

from torchtune.modules.transforms import find_supported_resolutions, get_canvas_best_fit


class TestUtils:
@pytest.mark.parametrize(
"params",
[
{
"max_num_tiles": 1,
"tile_size": 224,
"expected_resolutions": [(224, 224)],
},
{
"max_num_tiles": 2,
"tile_size": 100,
"expected_resolutions": [(100, 200), (200, 100), (100, 100)],
},
{
"max_num_tiles": 3,
"tile_size": 50,
"expected_resolutions": [
(50, 150),
(150, 50),
(50, 100),
(100, 50),
(50, 50),
],
},
{
"max_num_tiles": 4,
"tile_size": 300,
"expected_resolutions": [
(300, 1200),
(600, 600),
(300, 300),
(1200, 300),
(300, 900),
(900, 300),
(300, 600),
(600, 300),
],
},
],
)
def test_find_supported_resolutions(self, params):
max_num_tiles = params["max_num_tiles"]
tile_size = params["tile_size"]
expected_resolutions = params["expected_resolutions"]
resolutions = find_supported_resolutions(max_num_tiles, tile_size)

assert len(set(resolutions)) == len(resolutions), "Resolutions should be unique"
assert set(resolutions) == set(
expected_resolutions
), f"Expected resolutions {expected_resolutions} but got {resolutions}"

@pytest.mark.parametrize(
"params",
[
{
"image_size": (800, 600),
"possible_resolutions": [
(224, 896),
(448, 448),
(224, 224),
(896, 224),
(224, 672),
(672, 224),
(224, 448),
(448, 224),
],
"resize_to_max_canvax": False,
"expected_best_resolution": (448, 448),
},
{
"image_size": (200, 300),
"possible_resolutions": [
(224, 896),
(448, 448),
(224, 224),
(896, 224),
(224, 672),
(672, 224),
(224, 448),
(448, 224),
],
"resize_to_max_canvax": False,
"expected_best_resolution": (224, 448),
},
{
"image_size": (200, 500),
"possible_resolutions": [
(224, 896),
(448, 448),
(224, 224),
(896, 224),
(224, 672),
(672, 224),
(224, 448),
(448, 224),
],
"resize_to_max_canvax": True,
"expected_best_resolution": (224, 672),
},
{
"image_size": (200, 200),
"possible_resolutions": [
(224, 896),
(448, 448),
(224, 224),
(896, 224),
(224, 672),
(672, 224),
(224, 448),
(448, 224),
],
"resize_to_max_canvax": False,
"expected_best_resolution": (224, 224),
},
{
"image_size": (200, 100),
"possible_resolutions": [
(224, 896),
(448, 448),
(224, 224),
(896, 224),
(224, 672),
(672, 224),
(224, 448),
(448, 224),
],
"resize_to_max_canvax": True,
"expected_best_resolution": (448, 224),
},
],
)
def test_get_canvas_best_fit(self, params):
image_size = params["image_size"]
possible_resolutions = params["possible_resolutions"]
expected_best_resolution = params["expected_best_resolution"]
resize_to_max_canvas = params["resize_to_max_canvas"]

possible_resolutions = torch.tensor(possible_resolutions)

image = torch.rand(*image_size)
best_resolution = get_canvas_best_fit(
image, possible_resolutions, resize_to_max_canvas
)

assert (
tuple(best_resolution) == expected_best_resolution
), f"Expected best resolution {expected_best_resolution} but got {best_resolution}"
84 changes: 84 additions & 0 deletions tests/torchtune/modules/transforms/test_resize_with_pad.py
@@ -0,0 +1,84 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import pytest

import torch
import torchvision

from torchtune.modules.transforms import resize_with_pad


class TestTransforms:
@pytest.mark.parametrize(
"params",
[
{
"image_size": (200, 100),
"target_size": (1000, 1200),
"max_upscaling_size": 600,
"expected_resized_size": (600, 300),
},
{
"image_size": (2000, 200),
"target_size": (1000, 1200),
"max_upscaling_size": 600,
"expected_resized_size": (1000, 100),
},
{
"image_size": (400, 200),
"target_size": (1000, 1200),
"max_upscaling_size": 2000,
"expected_resized_size": (1000, 500),
},
{
"image_size": (400, 200),
"target_size": (1000, 1200),
"max_upscaling_size": None,
"expected_resized_size": (1000, 500),
},
{
"image_size": (1000, 500),
"target_size": (400, 300),
"max_upscaling_size": None,
"expected_resized_size": [400, 200],
},
],
)
def test_resize_with_pad(self, params):

image_size = params["image_size"]
target_size = params["target_size"]
max_upscaling_size = params["max_upscaling_size"]
expected_resized_size = params["expected_resized_size"]

image = torch.rand(3, *image_size) # Create a random image tensor

resized_image = resize_with_pad(
image=image,
target_size=target_size,
resample=torchvision.transforms.InterpolationMode.BILINEAR,
max_upscaling_size=max_upscaling_size,
)

# assert everything beyond resize has value == 0
assert torch.all(
resized_image[:, (expected_resized_size[0] + 1) :, :] == 0
), "Expected everything beyond resize to be pad with fill=0"

assert torch.all(
resized_image[:, :, (expected_resized_size[1] + 1) :] == 0
), "Expected everything beyond resize to be pad with fill=0"

assert torch.all(
resized_image[:, : expected_resized_size[0], : expected_resized_size[1]]
!= 0
), "Expected no padding where the image is supposed to be"

# output should have shape target_size
assert (
resized_image.shape[-2:] == target_size
), f"Expected output with shape {target_size} but got {resized_image.shape[-2:]}"