
[CLIP][IMAGE TRANSFORMS] Image transforms for clip encoder #1084

Merged · 28 commits · Jul 5, 2024
14 changes: 14 additions & 0 deletions docs/source/api_ref_modules.rst
@@ -79,3 +79,17 @@ Loss
:nosignatures:

loss.DPOLoss


Vision Transforms
------------------
Functions used for preprocessing images.

.. autosummary::
:toctree: generated/
:nosignatures:

transforms.get_canvas_best_fit
transforms.resize_with_pad
transforms.tile_crop
transforms.find_supported_resolutions
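
A minimal sketch of how these helpers might compose into one preprocessing pipeline, assuming the signatures exercised by the tests in this PR (`tile_crop`'s argument order is not shown in the diff and is an assumption):

```python
import torch
import torchvision
from torchtune.modules.transforms import (
    find_supported_resolutions,
    get_canvas_best_fit,
    resize_with_pad,
    tile_crop,
)

image = torch.rand(3, 800, 600)  # CHW image with values in [0, 1]

# Every canvas reachable with at most 4 tiles of 224x224.
possible_resolutions = torch.tensor(find_supported_resolutions(4, 224))

# Best-fitting canvas for this image; the third positional argument is
# resize_to_max_canvas, passed positionally as in the tests below.
# (Assumes the helper reads only the trailing H, W dims of the tensor.)
best_resolution = get_canvas_best_fit(image, possible_resolutions, False)
target_size = tuple(int(x) for x in best_resolution)

# Resize into the canvas, zero-padding the remainder.
padded = resize_with_pad(
    image=image,
    target_size=target_size,
    resample=torchvision.transforms.InterpolationMode.BILINEAR,
    max_upscaling_size=None,
)

# Split the canvas into fixed-size tiles.
tiles = tile_crop(padded, 224)  # expected shape: (n_tiles, 3, 224, 224)
```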
3 changes: 3 additions & 0 deletions pyproject.toml
@@ -10,6 +10,9 @@ authors = [
]
keywords = ["pytorch", "finetuning", "llm"]
dependencies = [
# multimodality
"torchvision",

# Hugging Face integrations
"datasets",
"huggingface_hub",
5 changes: 5 additions & 0 deletions tests/torchtune/models/clip/__init__.py
@@ -0,0 +1,5 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
75 changes: 75 additions & 0 deletions tests/torchtune/models/clip/test_clip_image_transform.py
@@ -0,0 +1,75 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import numpy as np
import PIL
import pytest

import torch

from torchtune.models.clip._transforms import CLIPImageTransform


class TestPipelines:
@pytest.mark.parametrize(
"params",
[
{
"image_size": (100, 100, 3),
"expected_shape": torch.Size([1, 3, 224, 224]),
"resize_to_max_canvas": False,
},
{
"image_size": (200, 300),
"expected_shape": torch.Size([4, 3, 224, 224]),
"resize_to_max_canvas": True,
},
{
"image_size": (100, 200, 3),
"expected_shape": torch.Size([2, 3, 224, 224]),
"resize_to_max_canvas": True,
},
{
"image_size": (100, 200),
"expected_shape": torch.Size([1, 3, 224, 224]),
"resize_to_max_canvas": False,
},
],
)
def test_clip_image_transform(self, params):

image_transform = CLIPImageTransform(
image_mean=None,
image_std=None,
tile_size=224,
possible_resolutions=None,
max_num_tiles=4,
resample="bilinear",
resize_to_max_canvas=params["resize_to_max_canvas"],
)

image_size = params["image_size"]

# Create a random image
image = (np.random.rand(*image_size) * 255).astype(np.uint8) # type: ignore
image = PIL.Image.fromarray(image) # type: ignore

output = image_transform(image=image)
output_image = output["image"]
output_ar = output["aspect_ratio"]

assert (
output_image.shape == params["expected_shape"]
), f"Expected shape {params['expected_shape']} but got {output_image.shape}"

assert (
0 <= output_image.min() <= output_image.max() <= 1
), f"Expected pixel values to be in range [0, 1] but got {output_image.min()} and {output_image.max()}"

expected_num_tiles = output_ar[0] * output_ar[1]
assert (
expected_num_tiles == output_image.shape[0]
), f"Expected {expected_num_tiles} tiles but got {output_image.shape[0]}"
160 changes: 160 additions & 0 deletions tests/torchtune/modules/transforms/test_get_canvas_best_fit.py
@@ -0,0 +1,160 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import pytest
import torch

from torchtune.modules.transforms import find_supported_resolutions, get_canvas_best_fit


class TestUtils:
@pytest.mark.parametrize(
"params",
[
{
"max_num_tiles": 1,
"tile_size": 224,
"expected_resolutions": [(224, 224)],
},
{
"max_num_tiles": 2,
"tile_size": 100,
"expected_resolutions": [(100, 200), (200, 100), (100, 100)],
},
{
"max_num_tiles": 3,
"tile_size": 50,
"expected_resolutions": [
(50, 150),
(150, 50),
(50, 100),
(100, 50),
(50, 50),
],
},
{
"max_num_tiles": 4,
"tile_size": 300,
"expected_resolutions": [
(300, 1200),
(600, 600),
(300, 300),
(1200, 300),
(300, 900),
(900, 300),
(300, 600),
(600, 300),
],
},
],
)
def test_find_supported_resolutions(self, params):
max_num_tiles = params["max_num_tiles"]
tile_size = params["tile_size"]
expected_resolutions = params["expected_resolutions"]
resolutions = find_supported_resolutions(max_num_tiles, tile_size)

assert len(set(resolutions)) == len(resolutions), "Resolutions should be unique"
assert set(resolutions) == set(
expected_resolutions
), f"Expected resolutions {expected_resolutions} but got {resolutions}"

@pytest.mark.parametrize(
"params",
[
{
"image_size": (800, 600),
"possible_resolutions": [
(224, 896),
(448, 448),
(224, 224),
(896, 224),
(224, 672),
(672, 224),
(224, 448),
(448, 224),
],
"resize_to_max_canvax": False,
"expected_best_resolution": (448, 448),
},
{
"image_size": (200, 300),
"possible_resolutions": [
(224, 896),
(448, 448),
(224, 224),
(896, 224),
(224, 672),
(672, 224),
(224, 448),
(448, 224),
],
"resize_to_max_canvax": False,
"expected_best_resolution": (224, 448),
},
{
"image_size": (200, 500),
"possible_resolutions": [
(224, 896),
(448, 448),
(224, 224),
(896, 224),
(224, 672),
(672, 224),
(224, 448),
(448, 224),
],
"resize_to_max_canvax": True,
"expected_best_resolution": (224, 672),
},
{
"image_size": (200, 200),
"possible_resolutions": [
(224, 896),
(448, 448),
(224, 224),
(896, 224),
(224, 672),
(672, 224),
(224, 448),
(448, 224),
],
"resize_to_max_canvax": False,
"expected_best_resolution": (224, 224),
},
{
"image_size": (200, 100),
"possible_resolutions": [
(224, 896),
(448, 448),
(224, 224),
(896, 224),
(224, 672),
(672, 224),
(224, 448),
(448, 224),
],
"resize_to_max_canvax": True,
"expected_best_resolution": (448, 224),
},
],
)
def test_get_canvas_best_fit(self, params):
image_size = params["image_size"]
possible_resolutions = params["possible_resolutions"]
expected_best_resolution = params["expected_best_resolution"]
resize_to_max_canvas = params["resize_to_max_canvas"]

possible_resolutions = torch.tensor(possible_resolutions)

image = torch.rand(*image_size)
best_resolution = get_canvas_best_fit(
image, possible_resolutions, resize_to_max_canvas
)

assert (
tuple(best_resolution) == expected_best_resolution
), f"Expected best resolution {expected_best_resolution} but got {best_resolution}"
84 changes: 84 additions & 0 deletions tests/torchtune/modules/transforms/test_resize_with_pad.py
@@ -0,0 +1,84 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import pytest

import torch
import torchvision

from torchtune.modules.transforms import resize_with_pad


class TestTransforms:
@pytest.mark.parametrize(
"params",
[
{
"image_size": (200, 100),
"target_size": (1000, 1200),
"max_upscaling_size": 600,
"expected_resized_size": (600, 300),
},
{
"image_size": (2000, 200),
"target_size": (1000, 1200),
"max_upscaling_size": 600,
"expected_resized_size": (1000, 100),
},
{
"image_size": (400, 200),
"target_size": (1000, 1200),
"max_upscaling_size": 2000,
"expected_resized_size": (1000, 500),
},
{
"image_size": (400, 200),
"target_size": (1000, 1200),
"max_upscaling_size": None,
"expected_resized_size": (1000, 500),
},
{
"image_size": (1000, 500),
"target_size": (400, 300),
"max_upscaling_size": None,
"expected_resized_size": [400, 200],
},
],
)
def test_resize_with_pad(self, params):

image_size = params["image_size"]
target_size = params["target_size"]
max_upscaling_size = params["max_upscaling_size"]
expected_resized_size = params["expected_resized_size"]

image = torch.rand(3, *image_size) # Create a random image tensor

resized_image = resize_with_pad(
image=image,
target_size=target_size,
resample=torchvision.transforms.InterpolationMode.BILINEAR,
max_upscaling_size=max_upscaling_size,
)

# assert everything beyond resize has value == 0
assert torch.all(
resized_image[:, (expected_resized_size[0] + 1) :, :] == 0
), "Expected everything beyond resize to be pad with fill=0"

assert torch.all(
resized_image[:, :, (expected_resized_size[1] + 1) :] == 0
), "Expected everything beyond resize to be pad with fill=0"

assert torch.all(
resized_image[:, : expected_resized_size[0], : expected_resized_size[1]]
!= 0
), "Expected no padding where the image is supposed to be"

# output should have shape target_size
assert (
resized_image.shape[-2:] == target_size
), f"Expected output with shape {target_size} but got {resized_image.shape[-2:]}"