From 093a6382aec0022480c5a390692bb3bac6dd8a11 Mon Sep 17 00:00:00 2001 From: Jonathan Swartz Date: Wed, 11 Dec 2024 15:47:23 +1300 Subject: [PATCH] fVDB: Added 'projects' directory to include more fully-featured example 'batteries included' projects Added panoptic segmentation project with MaskPLS implementation Fixed install issue not including `fvdb.optim` Re-introduced support for Volta&Turing but added errors for use of certain conv backends Added interpolation functions to VDBTensor Signed-off-by: Jonathan Swartz --- fvdb/Dockerfile | 2 +- fvdb/README.md | 49 +- fvdb/ci/main.sh | 3 + fvdb/examples/3dgs/accumulate_depths.py | 4 +- fvdb/examples/3dgs/download_example_data.py | 3 + fvdb/examples/3dgs/evaluate_colmap.py | 3 + .../3dgs/make_segmentation_dataset.py | 3 + fvdb/examples/3dgs/resume_colmap.py | 3 + fvdb/examples/3dgs/train_colmap.py | 3 + fvdb/examples/3dgs/train_segmentation.py | 5 +- fvdb/examples/3dgs/utils.py | 3 + fvdb/examples/3dgs/viz.py | 3 + fvdb/fvdb/nn/modules.py | 18 +- fvdb/fvdb/nn/vdbtensor.py | 20 +- fvdb/fvdb/types.py | 7 +- .../fvdb/utils/data/_colmap_utils/__init__.py | 3 + fvdb/fvdb/utils/data/_colmap_utils/camera.py | 2 + .../fvdb/utils/data/_colmap_utils/database.py | 4 +- fvdb/fvdb/utils/data/_colmap_utils/image.py | 3 +- .../fvdb/utils/data/_colmap_utils/rotation.py | 2 + .../utils/data/_colmap_utils/scene_manager.py | 2 + .../data/_colmap_utils/tools/colmap_to_nvm.py | 6 +- .../data/_colmap_utils/tools/delete_images.py | 6 +- .../tools/impute_missing_cameras.py | 6 +- .../tools/save_cameras_as_ply.py | 6 +- .../_colmap_utils/tools/transform_model.py | 4 + .../tools/write_camera_track_to_bundler.py | 6 +- .../tools/write_depthmap_to_ply.py | 3 + fvdb/fvdb/utils/tests/grid_utils.py | 3 + .../panoptic_segmentation/mask_pls/README.md | 40 ++ .../mask_pls/data/__init__.py | 9 + .../mask_pls/data/cache.py | 39 ++ .../mask_pls/data/collation.py | 96 ++++ .../mask_pls/data/e57.py | 80 +++ .../mask_pls/data/semanticKITTI.py | 484 ++++++++++++++++++ .../mask_pls/maskpls_environment.yml | 23 + .../mask_pls/models/__init__.py | 4 + .../mask_pls/models/mask_pls/__init__.py | 3 + .../mask_pls/models/mask_pls/backbone.py | 297 +++++++++++ .../mask_pls/models/mask_pls/blocks.py | 214 ++++++++ .../mask_pls/models/mask_pls/decoder.py | 212 ++++++++ .../mask_pls/models/mask_pls/loss.py | 204 ++++++++ .../mask_pls/models/mask_pls/mask_model.py | 117 +++++ .../models/mask_pls/positional_encoder.py | 56 ++ .../mask_pls/models/mask_pls/utils.py | 39 ++ .../panoptic_segmentation/mask_pls/train.py | 129 +++++ fvdb/setup.py | 9 +- fvdb/src/detail/build/Build.h | 2 +- fvdb/src/detail/build/CoarseFromFine.cpp | 2 +- fvdb/src/detail/build/ConvGrid.cpp | 2 +- fvdb/src/detail/build/DenseGrid.cpp | 2 +- fvdb/src/detail/build/EmptyGrid.cpp | 2 +- fvdb/src/detail/build/FineFromCoarse.cpp | 2 +- fvdb/src/detail/build/FromMesh.cpp | 2 +- .../build/NearestNeighborGridFromPoints.cpp | 2 +- .../src/detail/build/PaddedGridFromCoords.cpp | 2 +- fvdb/src/detail/build/PaddedGridFromGrid.cpp | 2 +- .../src/detail/build/PaddedGridFromPoints.cpp | 2 +- 58 files changed, 2199 insertions(+), 63 deletions(-) create mode 100644 fvdb/projects/panoptic_segmentation/mask_pls/README.md create mode 100644 fvdb/projects/panoptic_segmentation/mask_pls/data/__init__.py create mode 100644 fvdb/projects/panoptic_segmentation/mask_pls/data/cache.py create mode 100644 fvdb/projects/panoptic_segmentation/mask_pls/data/collation.py create mode 100644 fvdb/projects/panoptic_segmentation/mask_pls/data/e57.py create 
mode 100644 fvdb/projects/panoptic_segmentation/mask_pls/data/semanticKITTI.py create mode 100644 fvdb/projects/panoptic_segmentation/mask_pls/maskpls_environment.yml create mode 100644 fvdb/projects/panoptic_segmentation/mask_pls/models/__init__.py create mode 100644 fvdb/projects/panoptic_segmentation/mask_pls/models/mask_pls/__init__.py create mode 100644 fvdb/projects/panoptic_segmentation/mask_pls/models/mask_pls/backbone.py create mode 100644 fvdb/projects/panoptic_segmentation/mask_pls/models/mask_pls/blocks.py create mode 100644 fvdb/projects/panoptic_segmentation/mask_pls/models/mask_pls/decoder.py create mode 100644 fvdb/projects/panoptic_segmentation/mask_pls/models/mask_pls/loss.py create mode 100644 fvdb/projects/panoptic_segmentation/mask_pls/models/mask_pls/mask_model.py create mode 100644 fvdb/projects/panoptic_segmentation/mask_pls/models/mask_pls/positional_encoder.py create mode 100644 fvdb/projects/panoptic_segmentation/mask_pls/models/mask_pls/utils.py create mode 100644 fvdb/projects/panoptic_segmentation/mask_pls/train.py diff --git a/fvdb/Dockerfile b/fvdb/Dockerfile index 7bf041e68b..816dd189c2 100644 --- a/fvdb/Dockerfile +++ b/fvdb/Dockerfile @@ -12,6 +12,6 @@ RUN pip install --no-cache-dir -r env/build_requirements.txt RUN if [ "$MODE" = "production" ]; then \ MAX_JOBS=$(free -g | awk '/^Mem:/{jobs=int($4/2.5); if(jobs<1) jobs=1; print jobs}') \ - TORCH_CUDA_ARCH_LIST="8.0;8.6;8.9+PTX" \ + TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6+PTX" \ python setup.py install; \ fi \ No newline at end of file diff --git a/fvdb/README.md b/fvdb/README.md index c2ed6bc7fc..002f9735c9 100644 --- a/fvdb/README.md +++ b/fvdb/README.md @@ -29,7 +29,7 @@ During the project's initial stage of release, it is necessary to [run the build ** Notes:** * Linux is the only platform currently supported (Ubuntu >= 20.04 recommended). -* A CUDA-capable GPU with Ampere architecture or newer (i.e. compute capability >=8.0) is required to run the CUDA-accelerated operations in ƒVDB. +* A CUDA-capable GPU with Ampere architecture or newer (i.e. compute capability >=8.0) is recommended to run the CUDA-accelerated operations in ƒVDB. A GPU with compute capabililty >=7.0 (Volta architecture) is the minimum requirement but some operations and data types are not supported. ## Building *f*VDB from Source @@ -60,7 +60,7 @@ docker build --build-arg MODE=dev -t fvdb/dev . Running the docker container is done with the following command: ```shell # Run an interactive bash shell (or replace with your command) -docker run -it --gpus all --rm \ +docker run -it --ipc=host --gpus all --rm \ fvdb/dev:latest \ /bin/bash ``` @@ -68,57 +68,42 @@ docker run -it --gpus all --rm \ When running the docker container in `dev` mode and when you are ready to build ƒVDB, you can run the following command to build ƒVDB for the recommended set of CUDA architectures: ```shell MAX_JOBS=$(free -g | awk '/^Mem:/{jobs=int($4/2.5); if(jobs<1) jobs=1; print jobs}') \ - TORCH_CUDA_ARCH_LIST="8.0;8.6;8.9+PTX" \ + TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6+PTX" \ python setup.py install ``` #### Setting up a Conda Environment -In order to get resolved package versions in your conda environment consistent with our testing, it is necessary to configure your `.condarc` since not all package resolving behaviour can be controlled with an `environment.yml` file. We recommend using `strict` channel priority in your conda configuration. This can be done by running the following command: +*f*VDB can be used with any Conda distribution. 
Below is an installation guide using +[miniforge](https://github.com/conda-forge/miniforge). You can skip steps 1-3 if you already have a Conda installation. -```shell -conda config --set channel_priority strict -``` - -Further, it is recommend to not mix the `defaults` and `conda-forge` package channels when resolving environments. We have generally used `conda-forge` as the primary channel for our dependencies. You can remove the `defaults` channel and add `conda-forge` with the following command: +1. Download and Run Install Script. Copy the command below to download and run the [miniforge install script](https://github.com/conda-forge/miniforge?tab=readme-ov-file#unix-like-platforms-macos--linux): ```shell -conda config --remove channels defaults -conda config --add channels conda-forge -``` - -With these changes, it is recommended that your `.condarc` file looks like the following: - -```yaml -channel_priority: strict -channels: - - conda-forge +curl -L -O "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh" +bash Miniforge3-$(uname)-$(uname -m).sh ``` +2. Follow the prompts to customize Conda and run the install. Note, we recommend saying yes to enable `conda-init`. -**(Optional) Install libMamba for a huge quality of life improvement when using Conda** -``` -conda update -n base conda -conda install -n base conda-libmamba-solver -conda config --set solver libmamba -``` +3. Start Conda. Open a new terminal window, which should now show Conda initialized to the `(base)` environment. +4. Create the `fvdb` conda environment. Run the following command from the root of this repository: -Next, create the `fvdb` conda environment by running the following command from the root of this repository, and then grabbing a ☕: ```shell conda env create -f env/dev_environment.yml ``` -**Notes:** -* You can optionally use the `env/build_environment.yml` environment file if you want a minimum set of dependencies needed just to build/package *f*VDB (note this environment won't have all the runtime dependencies needed to `import fvdb`). -* If you would like a runtime environment which has only the packages required to run the unit tests after building ƒVDB, you can use the `env/test_environment.yml`. This is the environment used by the CI pipeline to run the tests after building ƒVDB in the `fvdb_build` environment. -* Use the `fvdb_learn` environment defined in `env/learn_environment.yml` if you would like an environment with the runtime requirements and the additional packages needed to run the [notebooks](notebooks) or [examples](examples) and view their visualizations. +5. Activate the *f*VDB environment: -Now activate the environment: ```shell conda activate fvdb ``` +#### Other available environments +* `fvdb_build`: Use `env/build_environment.yml` for a minimum set of dependencies needed just to build/package *f*VDB (note this environment won't have all the runtime dependencies needed to `import fvdb`). +* `fvdb_test`: Use `env/test_environment.yml` for a runtime environment which has only the packages required to run the unit tests after building ƒVDB. This is the environment used by the CI pipeline to run the tests after building ƒVDB in the `fvdb_build` environment. +* `fvdb_learn`: Use `env/learn_environment.yml` for additional runtime requirements and packages needed to run the [notebooks](notebooks) or [examples](examples) and view their visualizations. 
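+
+For example, to create and activate the learning environment (a minimal sketch; the environment name is assumed to match the `name:` field in `env/learn_environment.yml`, i.e. `fvdb_learn` as listed above):
+
+```shell
+conda env create -f env/learn_environment.yml
+conda activate fvdb_learn
+```
+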
### Building *f*VDB @@ -254,4 +239,4 @@ Please consider citing this when using *f*VDB in a project. You can use the cita } ``` -## Contact \ No newline at end of file +## Contact diff --git a/fvdb/ci/main.sh b/fvdb/ci/main.sh index de1c8deb8b..ef88f151c6 100644 --- a/fvdb/ci/main.sh +++ b/fvdb/ci/main.sh @@ -1,3 +1,6 @@ +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: Apache-2.0 +# # Running a Github Action Runner, the first argument # Starting a dockerd diff --git a/fvdb/examples/3dgs/accumulate_depths.py b/fvdb/examples/3dgs/accumulate_depths.py index a379f25256..d195ef5653 100644 --- a/fvdb/examples/3dgs/accumulate_depths.py +++ b/fvdb/examples/3dgs/accumulate_depths.py @@ -1,4 +1,6 @@ -import matplotlib.pyplot as plt +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: Apache-2.0 +# import numpy as np import point_cloud_utils as pcu import torch diff --git a/fvdb/examples/3dgs/download_example_data.py b/fvdb/examples/3dgs/download_example_data.py index e244011218..504b03db7a 100644 --- a/fvdb/examples/3dgs/download_example_data.py +++ b/fvdb/examples/3dgs/download_example_data.py @@ -1,4 +1,7 @@ #!/usr/bin/env python +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: Apache-2.0 +# """Script to download benchmark dataset(s)""" import os diff --git a/fvdb/examples/3dgs/evaluate_colmap.py b/fvdb/examples/3dgs/evaluate_colmap.py index 347c5414ab..a27ea408f8 100644 --- a/fvdb/examples/3dgs/evaluate_colmap.py +++ b/fvdb/examples/3dgs/evaluate_colmap.py @@ -1,3 +1,6 @@ +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: Apache-2.0 +# import time from typing import Optional, Union diff --git a/fvdb/examples/3dgs/make_segmentation_dataset.py b/fvdb/examples/3dgs/make_segmentation_dataset.py index 3997eedbf7..a3f0ba0e3b 100644 --- a/fvdb/examples/3dgs/make_segmentation_dataset.py +++ b/fvdb/examples/3dgs/make_segmentation_dataset.py @@ -1,3 +1,6 @@ +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: Apache-2.0 +# import torch import torch.utils.data import tqdm diff --git a/fvdb/examples/3dgs/resume_colmap.py b/fvdb/examples/3dgs/resume_colmap.py index 8c2b736b6a..2de123da7a 100644 --- a/fvdb/examples/3dgs/resume_colmap.py +++ b/fvdb/examples/3dgs/resume_colmap.py @@ -1,3 +1,6 @@ +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: Apache-2.0 +# from typing import Optional, Union import torch diff --git a/fvdb/examples/3dgs/train_colmap.py b/fvdb/examples/3dgs/train_colmap.py index 8259f4a7f3..44c7cf9279 100644 --- a/fvdb/examples/3dgs/train_colmap.py +++ b/fvdb/examples/3dgs/train_colmap.py @@ -1,3 +1,6 @@ +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: Apache-2.0 +# import itertools import json import logging diff --git a/fvdb/examples/3dgs/train_segmentation.py b/fvdb/examples/3dgs/train_segmentation.py index 285d9b2a6a..32cc77c139 100644 --- a/fvdb/examples/3dgs/train_segmentation.py +++ b/fvdb/examples/3dgs/train_segmentation.py @@ -1,9 +1,12 @@ +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: Apache-2.0 +# + import itertools import time from dataclasses import dataclass from typing import Union -import matplotlib.pyplot as plt import torch import tqdm import tyro diff --git a/fvdb/examples/3dgs/utils.py b/fvdb/examples/3dgs/utils.py index a2beafc161..30639e5989 100644 --- a/fvdb/examples/3dgs/utils.py +++ b/fvdb/examples/3dgs/utils.py @@ -1,3 +1,6 @@ +# Copyright Contributors to the 
OpenVDB Project +# SPDX-License-Identifier: Apache-2.0 +# import torch import torch.nn.functional as F diff --git a/fvdb/examples/3dgs/viz.py b/fvdb/examples/3dgs/viz.py index 7df6fdaeca..db3fd023b1 100644 --- a/fvdb/examples/3dgs/viz.py +++ b/fvdb/examples/3dgs/viz.py @@ -1,3 +1,6 @@ +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: Apache-2.0 +# import dataclasses import os import sys diff --git a/fvdb/fvdb/nn/modules.py b/fvdb/fvdb/nn/modules.py index f4fa407c68..f9c9bb8d4d 100644 --- a/fvdb/fvdb/nn/modules.py +++ b/fvdb/fvdb/nn/modules.py @@ -268,11 +268,27 @@ def _dispatch_conv(self, in_feature, in_grid, in_kmap, out_grid): backend = self.backend + sm_arch = torch.cuda.get_device_capability()[0] + torch.cuda.get_device_capability()[1] / 10 + # tf32 requires compute capability >= 8.0 (Ampere) if self.allow_tf32 and self.weight.is_cuda: assert ( - torch.cuda.get_device_capability()[0] >= 8 + sm_arch >= 8 ), "TF32 requires GPU with compute capability >= 8.0. Please set fvdb.nn.SparseConv3d.allow_tf32 = False." + # bf16 requires compute capability >= 8.0 (Ampere) + if self.weight.is_cuda and self.weight.dtype == torch.bfloat16: + assert sm_arch >= 8, "BF16 requires GPU with compute capability >= 8.0." + + # float16 requires compute capability >= 7.5 (Turing) + if self.weight.is_cuda and self.weight.dtype == torch.float16: + assert sm_arch >= 7.5, "FP16 requires GPU with compute capability >= 7.5." + + # cutlass, lggs, halo backends require compute capability >= 8.0 (Ampere) + if backend in ["cutlass", "lggs", "halo"]: + assert ( + torch.cuda.get_device_capability()[0] >= 8 + ), "cutlass, LGGS and Halo backends require GPU with compute capability >= 8.0." + if backend == "cutlass" and ( (not self.weight.is_cuda) or (self.in_channels, self.out_channels) not in self.CUTLASS_SUPPORTED_CHANNELS ): diff --git a/fvdb/fvdb/nn/vdbtensor.py b/fvdb/fvdb/nn/vdbtensor.py index 3b65dff1c2..3afce8245d 100644 --- a/fvdb/fvdb/nn/vdbtensor.py +++ b/fvdb/fvdb/nn/vdbtensor.py @@ -2,7 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 # from dataclasses import dataclass -from typing import Any, Optional, Union +from typing import Any, Optional, Tuple, Union import torch @@ -10,6 +10,8 @@ from fvdb import GridBatch, JaggedTensor, SparseConvPackInfo from fvdb.types import Vec3dBatch, Vec3dBatchOrScalar, Vec3i +JaggedTensorOrTensor = Union[torch.Tensor, JaggedTensor] + @dataclass class VDBTensor: @@ -204,6 +206,22 @@ def _binop_inplace(self, other, op): op(self.data, other) return self + # ----------------------- + # Interpolation functions + # ----------------------- + + def sample_bezier(self, points: JaggedTensorOrTensor) -> JaggedTensor: + return self.grid.sample_bezier(points, self.data) + + def sample_bezier_with_grad(self, points: JaggedTensorOrTensor) -> Tuple[JaggedTensor, JaggedTensor]: + return self.grid.sample_bezier_with_grad(points, self.data) + + def sample_trilinear(self, points: JaggedTensorOrTensor) -> JaggedTensor: + return self.grid.sample_trilinear(points, self.data) + + def sample_trilinear_with_grad(self, points: JaggedTensorOrTensor) -> Tuple[JaggedTensor, JaggedTensor]: + return self.grid.sample_trilinear_with_grad(points, self.data) + def cpu(self): return VDBTensor(self.grid.to("cpu"), self.data.cpu(), self.kmap.cpu() if self.kmap is not None else None) diff --git a/fvdb/fvdb/types.py b/fvdb/fvdb/types.py index 621b0857a7..3225fcc0d1 100644 --- a/fvdb/fvdb/types.py +++ b/fvdb/fvdb/types.py @@ -1,5 +1,10 @@ +# Copyright Contributors to the OpenVDB Project +# 
SPDX-License-Identifier: Apache-2.0 +# + from __future__ import annotations -from typing import List, Tuple, Union, Iterable + +from typing import Iterable, List, Tuple, Union import numpy import torch diff --git a/fvdb/fvdb/utils/data/_colmap_utils/__init__.py b/fvdb/fvdb/utils/data/_colmap_utils/__init__.py index 7044ce39dc..e58181ce74 100644 --- a/fvdb/fvdb/utils/data/_colmap_utils/__init__.py +++ b/fvdb/fvdb/utils/data/_colmap_utils/__init__.py @@ -1,3 +1,6 @@ +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: Apache-2.0 +# from .camera import Camera from .database import COLMAPDatabase from .image import Image diff --git a/fvdb/fvdb/utils/data/_colmap_utils/camera.py b/fvdb/fvdb/utils/data/_colmap_utils/camera.py index e40eb3a5f8..6751b81cbc 100644 --- a/fvdb/fvdb/utils/data/_colmap_utils/camera.py +++ b/fvdb/fvdb/utils/data/_colmap_utils/camera.py @@ -1,4 +1,6 @@ # Author: True Price +# SPDX-License-Identifier: Apache-2.0 +# import numpy as np from scipy.optimize import root diff --git a/fvdb/fvdb/utils/data/_colmap_utils/database.py b/fvdb/fvdb/utils/data/_colmap_utils/database.py index ec150ffbea..ce4142a84a 100644 --- a/fvdb/fvdb/utils/data/_colmap_utils/database.py +++ b/fvdb/fvdb/utils/data/_colmap_utils/database.py @@ -1,4 +1,6 @@ -import os +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: Apache-2.0 +# import sqlite3 import numpy as np diff --git a/fvdb/fvdb/utils/data/_colmap_utils/image.py b/fvdb/fvdb/utils/data/_colmap_utils/image.py index eb6a226832..25075698f4 100644 --- a/fvdb/fvdb/utils/data/_colmap_utils/image.py +++ b/fvdb/fvdb/utils/data/_colmap_utils/image.py @@ -1,5 +1,6 @@ +# SPDX-License-Identifier: Apache-2.0 # Author: True Price - +# import numpy as np # ------------------------------------------------------------------------------- diff --git a/fvdb/fvdb/utils/data/_colmap_utils/rotation.py b/fvdb/fvdb/utils/data/_colmap_utils/rotation.py index 90ec9533bf..615c626a93 100644 --- a/fvdb/fvdb/utils/data/_colmap_utils/rotation.py +++ b/fvdb/fvdb/utils/data/_colmap_utils/rotation.py @@ -1,4 +1,6 @@ +# SPDX-License-Identifier: Apache-2.0 # Author: True Price +# import numpy as np diff --git a/fvdb/fvdb/utils/data/_colmap_utils/scene_manager.py b/fvdb/fvdb/utils/data/_colmap_utils/scene_manager.py index 7cd2db9fd1..ac2c5065b5 100644 --- a/fvdb/fvdb/utils/data/_colmap_utils/scene_manager.py +++ b/fvdb/fvdb/utils/data/_colmap_utils/scene_manager.py @@ -1,4 +1,6 @@ +# SPDX-License-Identifier: Apache-2.0 # Author: True Price +# import array import os diff --git a/fvdb/fvdb/utils/data/_colmap_utils/tools/colmap_to_nvm.py b/fvdb/fvdb/utils/data/_colmap_utils/tools/colmap_to_nvm.py index e7c8db5029..60bf416fa8 100644 --- a/fvdb/fvdb/utils/data/_colmap_utils/tools/colmap_to_nvm.py +++ b/fvdb/fvdb/utils/data/_colmap_utils/tools/colmap_to_nvm.py @@ -1,10 +1,12 @@ +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: Apache-2.0 +# + import itertools import sys sys.path.append("..") -import numpy as np - from .. 
import Quaternion, SceneManager # ------------------------------------------------------------------------------- diff --git a/fvdb/fvdb/utils/data/_colmap_utils/tools/delete_images.py b/fvdb/fvdb/utils/data/_colmap_utils/tools/delete_images.py index 80f9bc9a86..d9c0f36d7f 100644 --- a/fvdb/fvdb/utils/data/_colmap_utils/tools/delete_images.py +++ b/fvdb/fvdb/utils/data/_colmap_utils/tools/delete_images.py @@ -1,9 +1,11 @@ +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: Apache-2.0 +# + import sys sys.path.append("..") -import numpy as np - from .. import DualQuaternion, Image, SceneManager # ------------------------------------------------------------------------------- diff --git a/fvdb/fvdb/utils/data/_colmap_utils/tools/impute_missing_cameras.py b/fvdb/fvdb/utils/data/_colmap_utils/tools/impute_missing_cameras.py index f579fa8ac5..61cf958821 100644 --- a/fvdb/fvdb/utils/data/_colmap_utils/tools/impute_missing_cameras.py +++ b/fvdb/fvdb/utils/data/_colmap_utils/tools/impute_missing_cameras.py @@ -1,9 +1,11 @@ +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: Apache-2.0 +# + import sys sys.path.append("..") -import numpy as np - from .. import DualQuaternion, Image, SceneManager # ------------------------------------------------------------------------------- diff --git a/fvdb/fvdb/utils/data/_colmap_utils/tools/save_cameras_as_ply.py b/fvdb/fvdb/utils/data/_colmap_utils/tools/save_cameras_as_ply.py index 5e6c9e8eac..5679dcb208 100644 --- a/fvdb/fvdb/utils/data/_colmap_utils/tools/save_cameras_as_ply.py +++ b/fvdb/fvdb/utils/data/_colmap_utils/tools/save_cameras_as_ply.py @@ -1,9 +1,11 @@ +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: Apache-2.0 +# + import sys sys.path.append("..") -import os - import numpy as np from .. import SceneManager diff --git a/fvdb/fvdb/utils/data/_colmap_utils/tools/transform_model.py b/fvdb/fvdb/utils/data/_colmap_utils/tools/transform_model.py index ea398ef5cd..5fba7261ea 100644 --- a/fvdb/fvdb/utils/data/_colmap_utils/tools/transform_model.py +++ b/fvdb/fvdb/utils/data/_colmap_utils/tools/transform_model.py @@ -1,3 +1,7 @@ +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: Apache-2.0 +# + import sys sys.path.append("..") diff --git a/fvdb/fvdb/utils/data/_colmap_utils/tools/write_camera_track_to_bundler.py b/fvdb/fvdb/utils/data/_colmap_utils/tools/write_camera_track_to_bundler.py index 43a721c162..725cd27116 100644 --- a/fvdb/fvdb/utils/data/_colmap_utils/tools/write_camera_track_to_bundler.py +++ b/fvdb/fvdb/utils/data/_colmap_utils/tools/write_camera_track_to_bundler.py @@ -1,9 +1,11 @@ +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: Apache-2.0 +# + import sys sys.path.append("..") -import numpy as np - from .. 
import SceneManager # ------------------------------------------------------------------------------- diff --git a/fvdb/fvdb/utils/data/_colmap_utils/tools/write_depthmap_to_ply.py b/fvdb/fvdb/utils/data/_colmap_utils/tools/write_depthmap_to_ply.py index 0dba35c870..edb55f7149 100644 --- a/fvdb/fvdb/utils/data/_colmap_utils/tools/write_depthmap_to_ply.py +++ b/fvdb/fvdb/utils/data/_colmap_utils/tools/write_depthmap_to_ply.py @@ -1,3 +1,6 @@ +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: Apache-2.0 +# import sys sys.path.append("..") diff --git a/fvdb/fvdb/utils/tests/grid_utils.py b/fvdb/fvdb/utils/tests/grid_utils.py index 0719239025..d1a4f4aafc 100644 --- a/fvdb/fvdb/utils/tests/grid_utils.py +++ b/fvdb/fvdb/utils/tests/grid_utils.py @@ -1,3 +1,6 @@ +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: Apache-2.0 +# from typing import Union import numpy as np diff --git a/fvdb/projects/panoptic_segmentation/mask_pls/README.md b/fvdb/projects/panoptic_segmentation/mask_pls/README.md new file mode 100644 index 0000000000..4105aa6011 --- /dev/null +++ b/fvdb/projects/panoptic_segmentation/mask_pls/README.md @@ -0,0 +1,40 @@ +# Mask-PLS: Panoptic LiDAR Segmentation + +This project implements [Mask-PLS (Mask-Based Panoptic LiDAR Segmentation)](https://github.com/PRBonn/MaskPLS) for panoptic LiDAR segmentation using fVDB. The model performs both semantic segmentation and instance segmentation of LiDAR point clouds. + +## Requirements + +Build an environment with the required dependencies for this project and install the `fVDB` package from a built wheel: + +```bash +conda env create -f maskpls_environment.yml +conda activate fvdb_maskpls +pip install /path/to/fVDB/dist/fvdb-0.2.0-cp312-cp312-linux_x86_64.whl # Replace with the correct wheel +``` + +## Usage + +A basic example of training the model is contained in `train.py`. 
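+Before launching a full training run, you can sanity-check the data pipeline on its own. The snippet below is a minimal sketch rather than part of `train.py`: it assumes a CUDA device (the fVDB collation defaults to `cuda:0`), a SemanticKITTI-style dataset at `/path/to/datasets/SemanticKITTI`, and that it is run from this project directory; it simply chains the collation classes from `data/collation.py`, so the exact wiring in `train.py` may differ.
+
+```python
+from pathlib import Path
+
+from data import (
+    SemanticKITTIDataset,
+    SemanticSegmentationDatasetCollation,
+    fVDBSemanticSegmentationDatasetCollation,
+)
+
+# Load two frames from the training split and collate them into a batch dictionary.
+dataset = SemanticKITTIDataset(Path("/path/to/datasets/SemanticKITTI"), split="train")
+collate = SemanticSegmentationDatasetCollation()       # tuples -> dict of per-sample lists
+to_vdb = fVDBSemanticSegmentationDatasetCollation()    # builds the sparse grid batch on cuda:0
+
+batch = to_vdb(collate([dataset[0], dataset[1]]))
+
+# Per-voxel features are the voxel ijk coordinates concatenated with intensity (4 channels).
+print(batch["vdbtensor"].data.jdata.shape)
+```
+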
The script can be run with the following command: + +```bash +python train.py --dataset-type SemanticKITTI \ + --dataset-path /path/to/datasets/SemanticKITTI \ + --dataset-spatial-normalization 82 80 32 # Magnitude of the spatial extents of the dataset for normalization +``` + +## Model Architecture + +- **MaskPLS**: The main model class that implements the full architecture with sub-modules: + - `MaskPLSEncoderDecoder` semantic segmentation head + - Optional masked transformer decoder `MaskedTransformerDecoder` for instance segmentation + + +## Supported Datasets + +- SemanticKITTI: Standard automotive LiDAR dataset +- E57: Generic point cloud format (random labels for testing) + + +## References + +Based on the MaskPLS paper: [MaskPLS: Mask-Based Panoptic LiDAR Segmentation](https://www.ipb.uni-bonn.de/wp-content/papercite-data/pdf/marcuzzi2023ral.pdf) \ No newline at end of file diff --git a/fvdb/projects/panoptic_segmentation/mask_pls/data/__init__.py b/fvdb/projects/panoptic_segmentation/mask_pls/data/__init__.py new file mode 100644 index 0000000000..8e5fe9795c --- /dev/null +++ b/fvdb/projects/panoptic_segmentation/mask_pls/data/__init__.py @@ -0,0 +1,9 @@ +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: Apache-2.0 +# +from .collation import ( + SemanticSegmentationDatasetCollation, + fVDBSemanticSegmentationDatasetCollation, +) +from .e57 import E57Dataset +from .semanticKITTI import SemanticKITTIDataset diff --git a/fvdb/projects/panoptic_segmentation/mask_pls/data/cache.py b/fvdb/projects/panoptic_segmentation/mask_pls/data/cache.py new file mode 100644 index 0000000000..b3ea11676d --- /dev/null +++ b/fvdb/projects/panoptic_segmentation/mask_pls/data/cache.py @@ -0,0 +1,39 @@ +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: Apache-2.0 +# +import tempfile +from pathlib import Path + +import torch +from torch.utils.data import Dataset + + +class CachedDataset(Dataset): + def __init__(self, original_dataset, cache_dir=tempfile.gettempdir()): + self.original_dataset = original_dataset + if cache_dir is None: + raise ValueError("cache_dir cannot be None") + cache_dir = ( + Path(cache_dir) + / self.original_dataset.__class__.__name__ + / (str(self.original_dataset.get_directory()).lstrip("/")) + ) + if not cache_dir.exists(): + cache_dir.mkdir(parents=True) + self.cache_dir = cache_dir + + def __getitem__(self, index): + if (self.cache_dir / f"{index}.pt").exists(): + return torch.load( + f"{self.cache_dir}/{index}.pt", + ) + + item = self.original_dataset[index] + torch.save(item, f"{self.cache_dir}/{index}.pt") + return item + + def __len__(self): + return len(self.original_dataset) + + def __getattr__(self, name): + return getattr(self.original_dataset, name) diff --git a/fvdb/projects/panoptic_segmentation/mask_pls/data/collation.py b/fvdb/projects/panoptic_segmentation/mask_pls/data/collation.py new file mode 100644 index 0000000000..739ac13763 --- /dev/null +++ b/fvdb/projects/panoptic_segmentation/mask_pls/data/collation.py @@ -0,0 +1,96 @@ +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: Apache-2.0 +# +from typing import Dict + +import torch + +import fvdb + + +class SemanticSegmentationDatasetCollation: + """ + A data collation class for semantic segmentation datasets. + This class handles the collation of various data components used in semantic + segmentation tasks, including point cloud coordinates, labels, and auxiliary data. 
+ Attributes: + keys (list): List of strings representing the expected data keys: + - "xyz": Point cloud coordinates + - "voxel_size": Voxel dimensions + - "semantic_labels": Semantic class labels + - "instance_labels": Instance segmentation labels + - "intensity": Point intensity values + - "semantic_embeddings": Semantic feature embeddings + - "cam_images": Camera images + Methods: + __call__(data): Collates the input data into a dictionary format. + Args: + data: A sequence of data samples to be collated. + Returns: + Dict: A dictionary where each key corresponds to a list of the respective data components. + """ + + def __init__(self): + self.keys = [ + "xyz", + "voxel_size", + "semantic_labels", + "instance_labels", + "intensity", + "semantic_embeddings", + "cam_images", + ] + + def __call__(self, data) -> Dict: + return {self.keys[i]: list(x) for i, x in enumerate(zip(*data))} + + +class fVDBSemanticSegmentationDatasetCollation: + """ + A data collation class for semantic segmentation datasets using fVDB. + This class handles the conversion of point cloud data into VDBTensors. + Parameters + ---------- + device : torch.device, optional + The device on which to perform computations (default is "cuda:0") + Methods + ------- + __call__(data: dict) -> dict: + Transforms input point cloud data into VDBTensors. + Parameters: + data (dict): Dictionary containing: + - xyz: List of point coordinates in world space + - voxel_size: List of voxel dimensions + - intensity: List of intensity values for points + Returns: + dict: Original dictionary updated with: + - xyz: JaggedTensor of point coordinates + - vdbtensor: VDBTensor containing the structured volumetric data + """ + + def __init__(self, device=torch.device("cuda:0")): + super().__init__() + self.device = device + + def __call__(self, data): + # xyz world space point positions + data["xyz"] = fvdb.JaggedTensor([torch.tensor(c, device=self.device) for c in data["xyz"]]) + + grid = fvdb.gridbatch_from_points(data["xyz"], voxel_sizes=[n.tolist() for n in data["voxel_size"]]) + + # get mapping of the coordinates to the grid for feature mapping + coord_ijks = grid.world_to_grid(data["xyz"]).round().int() + inv_idx = grid.ijk_to_inv_index(coord_ijks, cumulative=True) + + # assert(torch.all(grid.ijk.jdata == coord_ijks.jdata[inv_idx.jdata])) + + jfeats = torch.cat([torch.tensor(f, device=self.device).unsqueeze(-1) for f in data["intensity"]]) + jfeats = grid.jagged_like(jfeats[inv_idx.jdata]) + + jfeats = fvdb.jcat([grid.ijk.float(), jfeats], dim=1) + + vdbtensor = fvdb.nn.VDBTensor(grid, jfeats) + + data["vdbtensor"] = vdbtensor + + return data diff --git a/fvdb/projects/panoptic_segmentation/mask_pls/data/e57.py b/fvdb/projects/panoptic_segmentation/mask_pls/data/e57.py new file mode 100644 index 0000000000..180817481c --- /dev/null +++ b/fvdb/projects/panoptic_segmentation/mask_pls/data/e57.py @@ -0,0 +1,80 @@ +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: Apache-2.0 +# +import logging +from pathlib import Path +from typing import List, Union + +import numpy as np +import pye57 +from torch.utils.data import Dataset + +__all__ = [ + "E57Dataset", +] + + +class E57Dataset(Dataset): + logger = logging.getLogger("E57Dataset") + + def __init__( + self, + directory: Path, + cam_image_names: List = [ + "image_2", + ], + voxel_size: Union[float, np.ndarray] = 0.1, + ): + """This class handles E57 files and associated camera images for point cloud processing. 
+ Args: + directory (Path): Path to the directory containing E57 files + cam_image_names (List, optional): List of camera image folder names. Defaults to ["image_2"] + voxel_size (Union[float, np.ndarray], optional): Size of voxels for point cloud discretization. + Can be a single float or array. Defaults to 0.1 + """ + self._directory = directory + self._cam_image_names = cam_image_names + self._voxel_size = voxel_size + + # Hardcoded for now + self.num_classes = 20 + self.ignore_classes = [0] + + self._e57_files = list(directory.glob("*.e57")) + self.logger.info(f"Found {len(self._e57_files)} e57 files in {directory}") + + def get_directory(self): + return self._directory + + def __len__(self): + return len(self._e57_files) + + def __getitem__(self, idx): + if idx >= len(self): + raise IndexError(f"Index {idx} out of range {len(self)}") + + e57_file = self._e57_files[idx] + self.logger.debug(f"Loading {e57_file}") + + with pye57.E57(str(e57_file)) as e57: + scan = e57.read_scan(0, intensity=True, colors=False, transform=True) + xyz = np.stack([scan["cartesianX"], scan["cartesianY"], scan["cartesianZ"]], axis=-1) + + intensity = scan["intensity"] + + # color = np.stack([scan["colorRed"], scan["colorGreen"], scan["colorBlue"]], axis=-1) + + # TODO: Determine a better way to set the voxel size. Should this be a user param? Dataset metadata? + voxel_size = np.array(self._voxel_size) + if voxel_size.size not in [1, 3]: + raise ValueError(f"Invalid voxel size: {voxel_size}, must be a scalar or 3-element array") + if voxel_size.size == 1: + voxel_size = voxel_size.repeat(3) + + # instance labels just as random values as placeholder + semantic_labels = np.random.randint(0, self.num_classes, xyz[:, 0].shape, dtype=np.uint32) + + # TODO: Color data, image data, etc. + # xyz, voxel_size, semantic_labels, instance_labels, intensity, semantic_embeddings, cam_images + self.logger.debug(f"Returning data for {e57_file}") + return (xyz, voxel_size, semantic_labels, None, intensity, None, None) diff --git a/fvdb/projects/panoptic_segmentation/mask_pls/data/semanticKITTI.py b/fvdb/projects/panoptic_segmentation/mask_pls/data/semanticKITTI.py new file mode 100644 index 0000000000..8a4b73463e --- /dev/null +++ b/fvdb/projects/panoptic_segmentation/mask_pls/data/semanticKITTI.py @@ -0,0 +1,484 @@ +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: Apache-2.0 +# +from __future__ import annotations + +import itertools +import logging +from collections import namedtuple +from pathlib import Path +from typing import Dict, List, NamedTuple, Optional, Tuple, Union + +import numpy as np +import torch +import yaml +from PIL import Image +from torch.utils.data import Dataset + +__all__ = [ + "SemanticKITTIDataset", + "SemanticKITTIDataItem", + "SemanticKITTIDataSequence", +] + + +class SemanticKITTIDataItem: + def __init__( + self, + parent_sequence: SemanticKITTIDataSequence, + velodyne_file: Path, + calibration_data: NamedTuple, + cam_image_names: List = [], + label_file: Optional[Path] = None, + ) -> None: + """This class represents a single frame from the SemanticKITTI dataset, containing LiDAR point cloud, + optional semantic labels, calibration data, and associated camera images. + + Args: + parent_sequence (SemanticKITTIDataSequence): Parent sequence object containing this frame + velodyne_file (Path): Path to the velodyne point cloud file + calibration_data (NamedTuple): Calibration data for sensors + cam_image_names (List, optional): List of camera names to load images from. 
Defaults to empty list. + label_file (Optional[Path], optional): Path to semantic label file. Defaults to None. + + Raises: + FileNotFoundError: If velodyne file, label file, or any camera image file does not exist + """ + self._parent_sequence = parent_sequence + if not velodyne_file.exists(): + raise FileNotFoundError(f"Velodyne file {velodyne_file} not found") + self._velodyne_file = velodyne_file + if label_file and not label_file.exists(): + raise FileNotFoundError(f"Label file {label_file} not found") + self._label_file = label_file + + self._calib = calibration_data + self._cam_image_names = cam_image_names + + self._cam_image_files = [] + for cam_name in cam_image_names: + self._cam_image_files.append( + self._velodyne_file.parent.parent / cam_name / self._velodyne_file.with_suffix(".png").name + ) + if not self._cam_image_files[-1].exists(): + raise FileNotFoundError(f"Image file {self._cam_image_files[-1]} not found") + + def __repr__(self): + return f"SemanticKITTIDataItem({self._velodyne_file})" + + def get_parent_sequence(self) -> SemanticKITTIDataSequence: + return self._parent_sequence + + def has_labels(self) -> bool: + return self._label_file is not None + + def has_semantic_embeddings(self) -> bool: + if self._label_file: + if self._label_file.with_suffix(".pth").exists(): + return True + return False + + def get_semantic_embeddings(self) -> Dict: + if self.has_semantic_embeddings(): + semantic_embeddings = torch.load(self._label_file.with_suffix(".pth")) + semantic_embeddings["ins"] = semantic_embeddings["ins"].numpy() + semantic_embeddings["ins"] = 10 + ((semantic_embeddings["ins"] << 16) & 0xFFFF0000) + + embeddings_norm = semantic_embeddings["embeds"].norm(dim=-1, keepdim=True) + semantic_embeddings["embeds"] /= embeddings_norm + return semantic_embeddings + + else: + raise ValueError(f"No semantic embeddings found for {self}") + + def get_labels(self) -> Tuple[np.ndarray, np.ndarray]: + if not self.has_labels(): + raise ValueError(f"No labels found for {self}") + else: + label_data = np.fromfile(self._label_file, dtype=np.uint32) + + semantic_labels = label_data & 0xFFFF + instance_labels = label_data >> 16 + return semantic_labels, instance_labels + + def get_points_intensities(self) -> Tuple[np.ndarray, np.ndarray]: + # points + point_data = np.fromfile(self._velodyne_file, dtype=np.float32) + point_data = point_data.reshape((-1, 4)) + + xyz = point_data[:, :3] + + # intensities + intensity = point_data[:, 3] + if len(intensity.shape) == 2: + intensity = np.squeeze(intensity) + return xyz, intensity + + def get_cam_images(self, xyz: np.ndarray) -> Dict: + # load images + # + cam_images = { + "cam_names": self._cam_image_files, + "imgs": [], + "points_in_img": [], + "points_in_img_indices": [], + } + if len(self._cam_image_files) > 0: + imgs = [Image.open(f) for f in self._cam_image_files] + cam_images["imgs"] = [np.array(img) for img in imgs] + + for cam_name, img in zip(self._cam_image_names, cam_images["imgs"]): + xform_lidar_to_cam = getattr(self._calib, f"Tr_{cam_name}") + # add ones to the end of the points + xyz = np.hstack((xyz[:, :3], np.ones((xyz.shape[0], 1)))) + xyz_camera_space = np.dot(xyz, xform_lidar_to_cam.T) + valid_points_in_img = xyz_camera_space[:, 2] > 0 + + points_in_img = getattr(self._calib, f"P_{cam_name}").dot(xyz_camera_space.T).T + points_in_img /= points_in_img[:, 2].reshape(-1, 1) + points_in_img = points_in_img.astype(int)[:, :2] + + # valid points in image + width, height, _ = img.shape + valid_points_in_img = 
np.logical_and(valid_points_in_img, 0 <= points_in_img[:, 0]) + valid_points_in_img = np.logical_and(valid_points_in_img, points_in_img[:, 0] < height) + valid_points_in_img = np.logical_and(valid_points_in_img, 0 <= points_in_img[:, 1]) + valid_points_in_img = np.logical_and(valid_points_in_img, points_in_img[:, 1] < width) + + points_in_img = points_in_img[valid_points_in_img] + points_in_img_indices = np.where(valid_points_in_img)[0] + + cam_images["points_in_img"].append(points_in_img) + cam_images["points_in_img_indices"].append(points_in_img_indices) + return cam_images + + def get_points_intensities_cam_images(self, index): + xyz, intensity = self.get_points_intensities() + cam_images = self.get_cam_images(xyz) + return xyz, intensity, cam_images + + +class SemanticKITTIDataSequence: + def __init__( + self, + parent_dataset: SemanticKITTIDataset, + sequence_directory: Path, + cam_image_names: List = [], + labels_name: Optional[str] = "", + ) -> None: + """SemanticKITTIDataSequence represents a sequence sub-directory in the SemanticKITTI dataset. + + Args: + parent_dataset (SemanticKITTIDataset): The parent dataset instance this sequence belongs to + sequence_directory (Path): Path to the sequence directory containing velodyne data, labels etc. + cam_image_names (List, optional): List of camera image names to load. Defaults to empty list. + labels_name (Optional[str], optional): Name of the labels directory. Defaults to empty string. + + Returns: + None + """ + self._parent = parent_dataset + self._sequence_directory = sequence_directory + self._velodyne = sequence_directory / "velodyne" + self._labels = (sequence_directory / labels_name) if labels_name else None + + self._calib_path = sequence_directory / "calib.txt" + + self._cam_image_names = cam_image_names + + self.setup() + + def __len__(self): + return len(self._data_items) + + def get_parent_dataset(self): + return self._parent + + def setup(self): + # read calibration data from calib.txt + self.read_calib() + + # populate all the inidividual data items + def populate_data_items(): + for velodyne_file in sorted(self._velodyne.glob("*.bin")): + label_file = None + if self._labels: + label_file = self._labels / velodyne_file.name.replace("bin", "label") + yield SemanticKITTIDataItem( + self, + velodyne_file, + calibration_data=self.calib, + label_file=label_file, + cam_image_names=self._cam_image_names, + ) + + self._data_items = list(populate_data_items()) + + def read_calib(self) -> None: + """Load and compute intrinsic and extrinsic calibration parameters.""" + with self._calib_path.open("r") as f: + lines = f.readlines() + + filedata = {} + for line in lines: + key, value = line.strip().split(":", 1) + filedata[key] = np.array([float(x) for x in value.split()]) + + # We'll build the calibration parameters as a dictionary, then + # convert it to a namedtuple to prevent it from being modified later + data = {} + + # Create 3x4 projection matrices + P_rect_00 = np.reshape(filedata["P0"], (3, 4)) + P_rect_10 = np.reshape(filedata["P1"], (3, 4)) + P_rect_20 = np.reshape(filedata["P2"], (3, 4)) + P_rect_30 = np.reshape(filedata["P3"], (3, 4)) + + data["P_rect_00"] = P_rect_00 + data["P_rect_10"] = P_rect_10 + data["P_rect_20"] = P_rect_20 + data["P_rect_30"] = P_rect_30 + + # Compute the rectified extrinsics from cam0 to camN + T1 = np.eye(4) + T1[0, 3] = P_rect_10[0, 3] / P_rect_10[0, 0] + T2 = np.eye(4) + T2[0, 3] = P_rect_20[0, 3] / P_rect_20[0, 0] + T3 = np.eye(4) + T3[0, 3] = P_rect_30[0, 3] / P_rect_30[0, 0] + + # Compute 
the velodyne to rectified camera coordinate transforms + data["T_cam0_velo"] = np.reshape(filedata["Tr"], (3, 4)) + data["T_cam0_velo"] = np.vstack([data["T_cam0_velo"], [0, 0, 0, 1]]) + data["T_cam1_velo"] = T1.dot(data["T_cam0_velo"]) + data["T_cam2_velo"] = T2.dot(data["T_cam0_velo"]) + data["T_cam3_velo"] = T3.dot(data["T_cam0_velo"]) + + # Compute the camera intrinsics + data["K_cam0"] = P_rect_00[0:3, 0:3] + data["K_cam1"] = P_rect_10[0:3, 0:3] + data["K_cam2"] = P_rect_20[0:3, 0:3] + data["K_cam3"] = P_rect_30[0:3, 0:3] + + # Compute the stereo baselines in meters by projecting the origin of + # each camera frame into the velodyne frame and computing the distances + # between them + p_cam = np.array([0, 0, 0, 1]) + p_velo0 = np.linalg.inv(data["T_cam0_velo"]).dot(p_cam) + p_velo1 = np.linalg.inv(data["T_cam1_velo"]).dot(p_cam) + p_velo2 = np.linalg.inv(data["T_cam2_velo"]).dot(p_cam) + p_velo3 = np.linalg.inv(data["T_cam3_velo"]).dot(p_cam) + + data["b_gray"] = np.linalg.norm(p_velo1 - p_velo0) # gray baseline + data["b_rgb"] = np.linalg.norm(p_velo3 - p_velo2) # rgb baseline + + data["P_image_0"] = data["P_rect_00"] + data["P_image_1"] = data["P_rect_10"] + data["P_image_2"] = data["P_rect_20"] + data["P_image_3"] = data["P_rect_30"] + + data["Tr_image_0"] = data["T_cam0_velo"] + data["Tr_image_1"] = data["T_cam0_velo"] + data["Tr_image_2"] = data["T_cam0_velo"] + data["Tr_image_3"] = data["T_cam0_velo"] + # data["Tr_image_1"] = data["T_cam1_velo"] + # data["Tr_image_2"] = data["T_cam2_velo"] + # data["Tr_image_3"] = data["T_cam3_velo"] + + data["K_image_0"] = data["K_cam0"] + data["K_image_1"] = data["K_cam1"] + data["K_image_2"] = data["K_cam2"] + data["K_image_3"] = data["K_cam3"] + + self.calib = namedtuple("CalibData", data.keys())(*data.values()) + + def get_data_item(self, idx): + return self._data_items[idx] + + +class SemanticKITTIDataset(Dataset): + logger = logging.getLogger("SemanticKITTIDataset") + + def __init__( + self, + directory: Path, + format: str = "KITTI", + labels_name="labels", + cam_image_names: List = [ + "image_2", + ], + split: str = "train", + camera_frustum_filter: bool = False, + unlabelled_filter: bool = False, + voxel_size: Union[float, np.ndarray] = 0.1, + ): + """SemanticKITTI dataset. + + Args: + directory (Path): Root directory path containing dataset. + format (str, optional): Dataset format. Defaults to "KITTI". + labels_name (str, optional): Name of labels directory. Defaults to "labels". + cam_image_names (List, optional): List of camera image directory names. Defaults to ["image_2"]. + split (str, optional): Dataset split ('train', 'val', 'test') as defined in KITTI dataset config. Defaults to "train". + camera_frustum_filter (bool, optional): Whether to filter points outside camera frustum. Defaults to False. + unlabelled_filter (bool, optional): Whether to filter unlabelled points. Defaults to False. + voxel_size (Union[float, np.ndarray], optional): Size of voxels for point cloud discretization. Defaults to 0.1. + + Raises: + ValueError: If more than one or no yaml configuration file is found in the dataset directory. 
+ """ + self._root_directory = directory + self._format = format + self._labels_name = labels_name + self._cam_image_names = cam_image_names + self._split = split + self._camera_frustum_filter = camera_frustum_filter + self._unlabelled_filter = unlabelled_filter + self._voxel_size = voxel_size + + # load yaml config in dataset directory + yaml_files = list(directory.glob("*.yaml")) + if len(yaml_files) != 1: + raise ValueError(f"Expected exactly one yaml file in {directory}") + self._config = yaml.safe_load(yaml_files[0].open()) + + self.setup() + + def get_directory(self) -> Path: + return self._root_directory + + @property + def config(self) -> Dict: + return self._config + + def setup(self): + # setup from config + self.num_classes = len(self.config["learning_map_inv"]) + self.ignore_classes = [cl for cl, ignored in self.config["learning_ignore"].items() if ignored] + self.include_classes = [cl for cl, ignored in self.config["learning_ignore"].items() if not ignored] + + self.logger.info(f"Setting up '{self._split}' split of dataset {self._root_directory}") + # setup the sequences for this split + if self._split == "all": + self._data_sequences = [ + SemanticKITTIDataSequence( + self, + self._root_directory / "sequences" / f"{seq:02}", + cam_image_names=self._cam_image_names, + labels_name=self._labels_name, + ) + for seq in list(itertools.chain(*self.config["split"])) + ] + else: + self._data_sequences = [ + SemanticKITTIDataSequence( + self, + self._root_directory / "sequences" / f"{seq:02}", + cam_image_names=self._cam_image_names, + labels_name=self._labels_name, + ) + for seq in self.config["split"][ + self._split + ] # "split" in the config is a dictionary containing a list of sequence names for each split + ] + self.logger.info(f"Found {len(self._data_sequences)} sequences") + + def __len__(self) -> int: + return sum(len(seq) for seq in self._data_sequences) + + def __getitem__( + self, idx + ) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray, Union[np.ndarray, None], Dict]: + # TODO: Implement negative indexing + + # find the sequence that contains the item + data_item = None + for seq in self._data_sequences: + if idx < len(seq): + data_item = seq.get_data_item(idx) + break + idx -= len(seq) + if data_item is None: + raise IndexError("Index out of bounds") + ( + xyz, + intensity, + cam_images, + ) = data_item.get_points_intensities_cam_images(idx) + + # TODO: Determine a better way to set the voxel size. Should this be a user param? Dataset metadata? 
+ voxel_size = np.array(self._voxel_size) + if voxel_size.size not in [1, 3]: + raise ValueError(f"Invalid voxel size: {voxel_size}, must be a scalar or 3-element array") + if voxel_size.size == 1: + voxel_size = voxel_size.repeat(3) + + if data_item.has_labels(): + semantic_labels, instance_labels = data_item.get_labels() + else: + semantic_labels = np.zeros_like(xyz[:, 0], dtype=np.uint32) + instance_labels = np.zeros_like(xyz[:, 0], dtype=np.uint32) + + assert semantic_labels.shape[0] == instance_labels.shape[0] == xyz.shape[0], ( + f"Number of points ({xyz.shape[0]}) and " f"labels {semantic_labels.shape[0]} not the same for {self} " + ) + + semantic_embeddings = None + if data_item.has_semantic_embeddings(): + semantic_embeddings = data_item.get_semantic_embeddings() + + # things masks: semantic labels of thing instances + instance_semantic_labels = np.where(instance_labels == 0, 0, semantic_labels) + + # semantic + instance value + instance_labels = instance_semantic_labels + ((instance_labels << 16) & 0xFFFF0000) + semantic_labels = np.vectorize(self.config["learning_map"].__getitem__)(semantic_labels) + + # filter points outside of camera frustum + if self._camera_frustum_filter: + cam_frustum_indices = np.unique(np.concatenate(cam_images["points_in_img_indices"])) + + xyz = xyz[cam_frustum_indices] + semantic_labels = semantic_labels[cam_frustum_indices] + instance_labels = instance_labels[cam_frustum_indices] + intensity = intensity[cam_frustum_indices] + + points_in_img = cam_images["points_in_img"] + points_in_img_indices = cam_images["points_in_img_indices"] + + for i in range(len(points_in_img)): + _, intersec_indices_1, intersec_indices_2 = np.intersect1d( + points_in_img_indices[i], + cam_frustum_indices, + assume_unique=True, + return_indices=True, + ) + + points_in_img[i] = points_in_img[i][intersec_indices_1] + points_in_img_indices[i] = intersec_indices_2 + + # filter unlabelled points + if self._unlabelled_filter: + labelled_indices = np.where(semantic_labels != 0) + xyz = xyz[labelled_indices] + semantic_labels = semantic_labels[labelled_indices] + instance_labels = instance_labels[labelled_indices] + intensity = intensity[labelled_indices] + + return (xyz, voxel_size, semantic_labels, instance_labels, intensity, semantic_embeddings, cam_images) + + +if __name__ == "__main__": + import tyro + + logging.basicConfig(level=logging.INFO) + + def test_SemanticKITTIDataset(semanticKITTIPath: Path): + dataset = SemanticKITTIDataset(semanticKITTIPath, split="train") + print(f"Dataset length: {len(dataset)}") + print(f"Dataset config: {dataset.config}") + dataset_iter = iter(dataset) + print(f"Dataset item: {next(dataset_iter)}") + print(f"Dataset item: {next(dataset_iter)}") + + tyro.cli(test_SemanticKITTIDataset) diff --git a/fvdb/projects/panoptic_segmentation/mask_pls/maskpls_environment.yml b/fvdb/projects/panoptic_segmentation/mask_pls/maskpls_environment.yml new file mode 100644 index 0000000000..b13364e9af --- /dev/null +++ b/fvdb/projects/panoptic_segmentation/mask_pls/maskpls_environment.yml @@ -0,0 +1,23 @@ +name: fvdb_maskpls +channels: + - pytorch + - nvidia + - conda-forge + - nodefaults +dependencies: + - python=3.10 + - pytorch::pytorch=2.4.0 + - pytorch::pytorch-cuda=12.1 + - pytorch::pytorch-mutex=*=cuda + - pip + - git + - gitpython + - ipython + - tqdm + - numpy<2 + - tyro + - scikit-learn + - py-opencv + - imageio + - pip: + - pye57 \ No newline at end of file diff --git a/fvdb/projects/panoptic_segmentation/mask_pls/models/__init__.py 
b/fvdb/projects/panoptic_segmentation/mask_pls/models/__init__.py new file mode 100644 index 0000000000..fb83f8b597 --- /dev/null +++ b/fvdb/projects/panoptic_segmentation/mask_pls/models/__init__.py @@ -0,0 +1,4 @@ +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: Apache-2.0 +# +from .mask_pls.mask_model import MaskPLS diff --git a/fvdb/projects/panoptic_segmentation/mask_pls/models/mask_pls/__init__.py b/fvdb/projects/panoptic_segmentation/mask_pls/models/mask_pls/__init__.py new file mode 100644 index 0000000000..6e140576d0 --- /dev/null +++ b/fvdb/projects/panoptic_segmentation/mask_pls/models/mask_pls/__init__.py @@ -0,0 +1,3 @@ +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: Apache-2.0 +# diff --git a/fvdb/projects/panoptic_segmentation/mask_pls/models/mask_pls/backbone.py b/fvdb/projects/panoptic_segmentation/mask_pls/models/mask_pls/backbone.py new file mode 100644 index 0000000000..f1851e85d7 --- /dev/null +++ b/fvdb/projects/panoptic_segmentation/mask_pls/models/mask_pls/backbone.py @@ -0,0 +1,297 @@ +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: Apache-2.0 +# +from typing import List, Type + +import torch + +import fvdb +import fvdb.nn + +from .blocks import BasicConvolutionBlock, BasicDeconvolutionBlock, ResidualBlock + + +class fVDBSyncBatchNorm(torch.nn.Module): + r"""Applies Batch Normalization over a VDBTensor, synchronized across GPUs.""" + + def __init__( + self, + num_features, + eps=1e-5, + momentum=0.1, + affine=True, + track_running_stats=True, + process_group=None, + ): + torch.nn.Module.__init__(self) + self.bn = torch.nn.SyncBatchNorm( + num_features, + eps=eps, + momentum=momentum, + affine=affine, + track_running_stats=track_running_stats, + process_group=process_group, + ) + + def forward(self, input: fvdb.nn.VDBTensor) -> fvdb.nn.VDBTensor: + num_channels = input.data.jdata.size(1) + assert ( + num_channels == self.bn.num_features + ), "Input feature should have the same number of self.channels as BatchNorm" + result_data = self.bn(input.data.jdata) + return fvdb.nn.VDBTensor(input.grid, input.grid.jagged_like(result_data), input.kmap) + + @classmethod + def convert_sync_batchnorm(cls, module, process_group=None): + r"""Helper function to convert + :attr:`fvdb.nn.BatchNorm` layer in the model to + :attr:`fVDBSyncBatchNorm` layer. + + Args: + module (nn.Module): containing module + process_group (optional): process group to scope synchronization, + default is the whole world + + Returns: + The original module with the converted + :attr:`fvdb.nn.BatchNorm` layer + + Example:: + + >>> # Network with fvdb.nn.BatchNorm layer + >>> module = torch.nn.Sequential( + >>> torch.nn.Linear(20, 100), + >>> fvdb.nn.BatchNorm(100) + >>> ).cuda() + >>> # creating process group (optional) + >>> # process_ids is a list of int identifying rank ids. 
+ >>> process_group = torch.distributed.new_group(process_ids) + >>> sync_bn_module = convert_sync_batchnorm(module, process_group) + + """ + module_output = module + if isinstance(module, fvdb.nn.BatchNorm): + module_output = fVDBSyncBatchNorm( + module.num_features, + module.eps, + module.momentum, # type: ignore + module.affine, + module.track_running_stats, + process_group, + ) + if module.affine: + with torch.no_grad(): + module_output.bn.weight = module.weight + module_output.bn.bias = module.bias + module_output.bn.running_mean = module.running_mean + module_output.bn.running_var = module.running_var + module_output.bn.num_batches_tracked = module.num_batches_tracked + if hasattr(module, "qconfig"): + module_output.bn.qconfig = module.qconfig + for name, child in module.named_children(): + module_output.add_module(name, cls.convert_sync_batchnorm(child, process_group)) + del module + return module_output + + +class MaskPLSEncoderDecoder(torch.nn.Module): + + channels = [32, 32, 64, 128, 256, 256, 128, 96, 96] + + def __init__( + self, + input_dim: int = 4, + stem_blocks: int = 1, + output_feature_levels: List[int] = [3], + conv_deconv_non_lin: Type = fvdb.nn.ReLU, + bn_momentum: float = 0.02, + ): + super().__init__() + self.output_feature_levels = output_feature_levels + down_res_blocks = [2, 3, 4, 6] + + self.stem = [ + fvdb.nn.SparseConv3d(input_dim, self.channels[0], kernel_size=3), + fvdb.nn.BatchNorm(self.channels[0], momentum=bn_momentum), + fvdb.nn.ReLU(inplace=True), + ] + for _ in range(1, stem_blocks): + self.stem.extend( + [ + fvdb.nn.SparseConv3d(self.channels[0], self.channels[0], kernel_size=3), + fvdb.nn.BatchNorm(self.channels[0], momentum=bn_momentum), + fvdb.nn.ReLU(inplace=True), + ] + ) + self.stem = torch.nn.Sequential(*self.stem) + + self.stage1 = [ + BasicConvolutionBlock( + self.channels[0], self.channels[0], ks=2, stride=2, bn_mom=bn_momentum, non_lin=conv_deconv_non_lin + ), + ResidualBlock(self.channels[0], self.channels[1], ks=3, bn_mom=bn_momentum), + ] + self.stage1.extend( + [ + ResidualBlock(self.channels[1], self.channels[1], ks=3, bn_mom=bn_momentum) + for _ in range(1, down_res_blocks[0]) + ] + ) + self.stage1 = torch.nn.Sequential(*self.stage1) + + self.stage2 = [ + BasicConvolutionBlock( + self.channels[1], self.channels[1], ks=2, stride=2, bn_mom=bn_momentum, non_lin=conv_deconv_non_lin + ), + ResidualBlock(self.channels[1], self.channels[2], ks=3, bn_mom=bn_momentum), + ] + self.stage2.extend( + [ + ResidualBlock(self.channels[2], self.channels[2], ks=3, bn_mom=bn_momentum) + for _ in range(1, down_res_blocks[1]) + ] + ) + self.stage2 = torch.nn.Sequential(*self.stage2) + + self.stage3 = [ + BasicConvolutionBlock( + self.channels[2], self.channels[2], ks=2, stride=2, bn_mom=bn_momentum, non_lin=conv_deconv_non_lin + ), + ResidualBlock(self.channels[2], self.channels[3], ks=3, bn_mom=bn_momentum), + ] + self.stage3.extend( + [ + ResidualBlock(self.channels[3], self.channels[3], ks=3, bn_mom=bn_momentum) + for _ in range(1, down_res_blocks[2]) + ] + ) + self.stage3 = torch.nn.Sequential(*self.stage3) + + self.stage4 = [ + BasicConvolutionBlock( + self.channels[3], self.channels[3], ks=2, stride=2, bn_mom=bn_momentum, non_lin=conv_deconv_non_lin + ), + ResidualBlock(self.channels[3], self.channels[4], ks=3, bn_mom=bn_momentum), + ] + self.stage4.extend( + [ + ResidualBlock(self.channels[4], self.channels[4], ks=3, bn_mom=bn_momentum) + for _ in range(1, down_res_blocks[3]) + ] + ) + self.stage4 = torch.nn.Sequential(*self.stage4) + + self.up1 = 
torch.nn.ModuleList( + [ + BasicDeconvolutionBlock( + self.channels[4], + self.channels[5], + ks=2, + stride=2, + bn_mom=bn_momentum, + ), + torch.nn.Sequential( + ResidualBlock(self.channels[5] + self.channels[3], self.channels[5], ks=3, bn_mom=bn_momentum), + ResidualBlock(self.channels[5], self.channels[5], ks=3, bn_mom=bn_momentum), + ), + ] + ) + + self.up2 = torch.nn.ModuleList( + [ + BasicDeconvolutionBlock( + self.channels[5], + self.channels[6], + ks=2, + stride=2, + bn_mom=bn_momentum, + ), + torch.nn.Sequential( + ResidualBlock(self.channels[6] + self.channels[2], self.channels[6], ks=3, bn_mom=bn_momentum), + ResidualBlock(self.channels[6], self.channels[6], ks=3, bn_mom=bn_momentum), + ), + ] + ) + + self.up3 = torch.nn.ModuleList( + [ + BasicDeconvolutionBlock( + self.channels[6], + self.channels[7], + ks=2, + stride=2, + bn_mom=bn_momentum, + ), + torch.nn.Sequential( + ResidualBlock(self.channels[7] + self.channels[1], self.channels[7], ks=3, bn_mom=bn_momentum), + ResidualBlock(self.channels[7], self.channels[7], ks=3, bn_mom=bn_momentum), + ), + ] + ) + + self.up4 = torch.nn.ModuleList( + [ + BasicDeconvolutionBlock( + self.channels[7], + self.channels[8], + ks=2, + stride=2, + bn_mom=bn_momentum, + ), + torch.nn.Sequential( + ResidualBlock(self.channels[8] + self.channels[0], self.channels[8], ks=3, bn_mom=bn_momentum), + ResidualBlock(self.channels[8], self.channels[8], ks=3, bn_mom=bn_momentum), + ), + ] + ) + + levels = [self.channels[-i] for i in range(4, 0, -1)] + + # conv mask projection + self.mask_feat = fvdb.nn.SparseConv3d( + self.channels[-1], + self.channels[-1], + kernel_size=3, + stride=1, + ) + + self.out_bnorm = torch.nn.ModuleList([torch.nn.Sequential() for _ in levels]) + + def forward(self, x) -> List[fvdb.nn.VDBTensor]: + + sparse_input = x["vdbtensor"] + + x0 = self.stem(sparse_input) # type: ignore + x1 = self.stage1(x0) # type: ignore + x2 = self.stage2(x1) # type: ignore + x3 = self.stage3(x2) # type: ignore + x4 = self.stage4(x3) # type: ignore + + y1 = self.up1[0](x4, out_grid=x3.grid) + y1 = fvdb.jcat([y1, x3], dim=1) + y1 = self.up1[1](y1) + + y2 = self.up2[0](y1, out_grid=x2.grid) + y2 = fvdb.jcat([y2, x2], dim=1) + y2 = self.up2[1](y2) + + y3 = self.up3[0](y2, out_grid=x1.grid) + y3 = fvdb.jcat([y3, x1], dim=1) + y3 = self.up3[1](y3) + + y4 = self.up4[0](y3, out_grid=x0.grid) + y4 = fvdb.jcat([y4, x0], dim=1) + y4 = self.up4[1](y4) + + out_feats = [y1, y2, y3, y4] + + feat_levels = self.output_feature_levels + [3] + + out_feats = [out_feats[feats] for feats in feat_levels] + + out_feats[-1] = self.mask_feat(out_feats[-1]) + + # batch norm + out_feats = [bn(feat) for feat, bn in zip(out_feats, self.out_bnorm)] + + return out_feats diff --git a/fvdb/projects/panoptic_segmentation/mask_pls/models/mask_pls/blocks.py b/fvdb/projects/panoptic_segmentation/mask_pls/models/mask_pls/blocks.py new file mode 100644 index 0000000000..6c4f4cbbd7 --- /dev/null +++ b/fvdb/projects/panoptic_segmentation/mask_pls/models/mask_pls/blocks.py @@ -0,0 +1,214 @@ +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: Apache-2.0 +# +from typing import Optional + +import torch +import torch.nn.functional as F + +import fvdb +import fvdb.nn + + +class SelfAttentionLayer(torch.nn.Module): + def __init__(self, d_model, nhead, dropout=0.0): + super().__init__() + self.self_attn = torch.nn.MultiheadAttention(d_model, nhead, dropout=dropout, batch_first=True) + + self.norm = torch.nn.LayerNorm(d_model) + self.dropout = torch.nn.Dropout(dropout) + + 
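For reference, each decoder stage in `MaskPLSEncoderDecoder.forward` above follows the same U-Net-style pattern: a transposed sparse convolution targeted at the skip connection's grid, a channel-wise `fvdb.jcat`, then residual refinement. A minimal sketch of that pattern, assuming `up` is one of the `upN` ModuleLists built in `__init__` (the helper name `up_stage` is hypothetical and not part of the patch):

```python
import fvdb

def up_stage(up, coarse, skip):
    # Transposed SparseConv3d producing output on the skip connection's grid
    y = up[0](coarse, out_grid=skip.grid)
    # Channel-wise concatenation of two VDBTensors that share the same grid
    y = fvdb.jcat([y, skip], dim=1)
    # Residual refinement at the skip connection's resolution
    return up[1](y)

# e.g. the first decoder step in forward() is equivalent to: y1 = up_stage(self.up1, x4, x3)
```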
self.activation = F.relu + + self._reset_parameters() + + def _reset_parameters(self): + for p in self.parameters(): + if p.dim() > 1: + torch.nn.init.xavier_uniform_(p) + + def with_pos_embed(self, tensor, pos: Optional[torch.Tensor]): + return tensor if pos is None else tensor + pos + + def forward( + self, + q_embed, + attn_mask: Optional[torch.Tensor] = None, + padding_mask: Optional[torch.Tensor] = None, + query_pos: Optional[torch.Tensor] = None, + ): + q = k = self.with_pos_embed(q_embed, query_pos) + q_embed2 = self.self_attn(q, k, value=q_embed, attn_mask=attn_mask, key_padding_mask=padding_mask)[0] + q_embed = q_embed + self.dropout(q_embed2) + q_embed = self.norm(q_embed) + return q_embed + + +class CrossAttentionLayer(torch.nn.Module): + def __init__(self, d_model, nhead, dropout=0.0): + super().__init__() + self.multihead_attn = torch.nn.MultiheadAttention(d_model, nhead, dropout=dropout, batch_first=True) + + self.norm = torch.nn.LayerNorm(d_model) + self.dropout = torch.nn.Dropout(dropout) + + self.activation = F.relu + + self._reset_parameters() + + def _reset_parameters(self): + for p in self.parameters(): + if p.dim() > 1: + torch.nn.init.xavier_uniform_(p) + + def with_pos_embed(self, tensor, pos: Optional[torch.Tensor]): + return tensor if pos is None else tensor + pos + + def with_pos_embed2(self, tensor, pos: Optional[torch.Tensor]): + out = torch.cat((tensor, pos.unsqueeze(0)), dim=-1) # type: ignore + return out + + def forward( + self, + q_embed, + bb_feat, + attn_mask: Optional[torch.Tensor] = None, + padding_mask: Optional[torch.Tensor] = None, + pos: Optional[torch.Tensor] = None, + query_pos: Optional[torch.Tensor] = None, + ): + q_embed = self.norm(q_embed) + q_embed2 = self.multihead_attn( + query=self.with_pos_embed(q_embed, query_pos), + key=self.with_pos_embed(bb_feat, pos), + value=self.with_pos_embed(bb_feat, pos), + # value=bb_feat, + attn_mask=attn_mask, + key_padding_mask=padding_mask, + )[0] + q_embed = q_embed + self.dropout(q_embed2) + return q_embed + + +class FFNLayer(torch.nn.Module): + def __init__(self, d_model, dim_feedforward=2048, dropout=0.0): + super().__init__() + # Implementation of Feedforward model + self.linear1 = torch.nn.Linear(d_model, dim_feedforward) + self.dropout = torch.nn.Dropout(dropout) + self.linear2 = torch.nn.Linear(dim_feedforward, d_model) + + self.norm = torch.nn.LayerNorm(d_model) + + self.activation = F.relu + + self._reset_parameters() + + def _reset_parameters(self): + for p in self.parameters(): + if p.dim() > 1: + torch.nn.init.xavier_uniform_(p) + + def with_pos_embed(self, tensor, pos: Optional[torch.Tensor]): + return tensor if pos is None else tensor + pos + + def forward(self, tgt): + tgt = self.norm(tgt) + tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt)))) + tgt = tgt + self.dropout(tgt2) + return tgt + + +class MLP(torch.nn.Module): + def __init__(self, input_dim, hidden_dim_list, output_dim, use_fvdb: bool = False): + super().__init__() + if use_fvdb: + linear_cls = fvdb.nn.Linear + relu_cls = fvdb.nn.ReLU + else: + linear_cls = torch.nn.Linear + relu_cls = torch.nn.ReLU + + self.num_layers = len(hidden_dim_list) + 1 + h = hidden_dim_list + self.layers = torch.nn.ModuleList(linear_cls(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) + self.relu = relu_cls() + + def forward(self, x): + for i, layer in enumerate(self.layers): + x = layer(x) + if i < self.num_layers - 1: + x = self.relu(x) + return x + + +class BasicConvolutionBlock(torch.nn.Module): + def __init__( + self, + 
inc, + outc, + ks=3, + stride=1, + dilation=1, + bn_mom=0.1, + non_lin=fvdb.nn.ReLU, + ): + super().__init__() + if dilation != 1: + raise NotImplementedError("Dilation not implemented for fVDB SparseConv3d") + self.net = torch.nn.Sequential( + fvdb.nn.SparseConv3d(inc, outc, kernel_size=ks, stride=stride), + fvdb.nn.BatchNorm(outc, momentum=bn_mom), + non_lin(inplace=True), + ) + + def forward(self, x): + out = self.net(x) + return out + + +class BasicDeconvolutionBlock(torch.nn.Module): + def __init__(self, inc, outc, ks=3, stride=1, bn_mom=0.1, non_lin=fvdb.nn.LeakyReLU): + super().__init__() + self.net = torch.nn.Sequential( + fvdb.nn.SparseConv3d(inc, outc, kernel_size=ks, stride=stride, transposed=True), + fvdb.nn.BatchNorm(outc, momentum=bn_mom), + non_lin(inplace=True), + ) + + def forward(self, x, out_grid=None): + for module in self.net: + if isinstance(module, fvdb.nn.SparseConv3d): + x = module(x, out_grid=out_grid) + else: + x = module(x) + return x + + +class ResidualBlock(torch.nn.Module): + def __init__(self, inc, outc, ks=3, stride=1, dilation=1, bn_mom=0.1): + super().__init__() + if dilation != 1: + raise NotImplementedError("Dilation not implemented for fVDB SparseConv3d") + self.net = torch.nn.Sequential( + fvdb.nn.SparseConv3d(inc, outc, kernel_size=ks, stride=stride), + fvdb.nn.BatchNorm(outc, momentum=bn_mom), + fvdb.nn.ReLU(inplace=True), + fvdb.nn.SparseConv3d(outc, outc, kernel_size=ks, stride=1), + fvdb.nn.BatchNorm(outc, momentum=bn_mom), + ) + + self.downsample = ( + torch.nn.Sequential() + if (inc == outc and stride == 1) + else torch.nn.Sequential( + fvdb.nn.SparseConv3d(inc, outc, kernel_size=1, stride=stride), + fvdb.nn.BatchNorm(outc, momentum=bn_mom), + ) + ) + + self.relu = fvdb.nn.ReLU(inplace=True) + + def forward(self, x): + out = self.relu(self.net(x) + self.downsample(x)) + return out diff --git a/fvdb/projects/panoptic_segmentation/mask_pls/models/mask_pls/decoder.py b/fvdb/projects/panoptic_segmentation/mask_pls/models/mask_pls/decoder.py new file mode 100644 index 0000000000..259171a874 --- /dev/null +++ b/fvdb/projects/panoptic_segmentation/mask_pls/models/mask_pls/decoder.py @@ -0,0 +1,212 @@ +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: Apache-2.0 +# +# Modified by Rodrigo Marcuzzi from https://github.com/facebookresearch/Mask2Former +from typing import Dict, Tuple + +import torch + +from .blocks import MLP, CrossAttentionLayer, FFNLayer, SelfAttentionLayer +from .positional_encoder import PositionalEncoder + + +class MaskedTransformerDecoder(torch.nn.Module): + def __init__( + self, + num_classes, + dataset_extent: Tuple[float, float, float], + backbone_channels: Tuple[int, ...], + num_queries: int = 100, + ): + super().__init__() + self.backbone_channels = backbone_channels + hidden_dim = 256 + self.num_queries = num_queries + self.num_feature_levels = 1 + + self.pe_layer = PositionalEncoder(hidden_dim, dataset_extent) + + self.num_layers = 1 # feature levels + self.decoder_blocks = 6 + self.num_layers *= self.decoder_blocks + + self.num_heads = 8 + + self.feature_level_encoder = False + + self.transformer_self_attention_layers = torch.nn.ModuleList() + self.transformer_cross_attention_layers = torch.nn.ModuleList() + self.transformer_ffn_layers = torch.nn.ModuleList() + for _ in range(self.num_layers): + self.transformer_self_attention_layers.append( + SelfAttentionLayer(d_model=hidden_dim, nhead=self.num_heads, dropout=0.0) + ) + self.transformer_cross_attention_layers.append( + 
CrossAttentionLayer(d_model=hidden_dim, nhead=self.num_heads, dropout=0.0) + ) + self.transformer_ffn_layers.append( + FFNLayer( + d_model=hidden_dim, + dim_feedforward=1024, + dropout=0.0, + ) + ) + + self.decoder_norm = torch.nn.LayerNorm(hidden_dim) + self.query_feat = torch.nn.Embedding(self.num_queries, hidden_dim) + self.query_embed = torch.nn.Embedding(self.num_queries, hidden_dim) + if self.feature_level_encoder: + self.level_embed = torch.nn.Embedding(self.num_feature_levels, hidden_dim) + + self.mask_feat_proj = torch.nn.Sequential() + in_channels = self.backbone_channels # backbone channels + if in_channels[-1] != hidden_dim: + self.mask_feat_proj = torch.nn.Linear(in_channels[-1], hidden_dim) + + in_channels = in_channels[:-1][-self.num_feature_levels :] + + self.input_proj = torch.nn.ModuleList() + for ch in in_channels: + if ch != hidden_dim: # linear projection to hidden_dim + self.input_proj.append(torch.nn.Linear(ch, hidden_dim)) + else: + self.input_proj.append(torch.nn.Sequential()) + + # output FFNs + + self.class_embed = torch.nn.Linear(hidden_dim, num_classes + 1) + self.mask_embed = MLP(hidden_dim, [hidden_dim, hidden_dim], hidden_dim) + + def forward(self, feats, coors, pad_masks) -> Tuple[Dict, torch.Tensor]: + last_coors = coors.pop() + last_feat = feats.pop() + + mask_features = self.mask_feat_proj(last_feat) + self.pe_layer(last_coors) + last_pad = pad_masks.pop() + src = [] + pos = [] + size_list = [] + + for i in range(self.num_feature_levels): + size_list.append(feats[i].shape[1]) + pos.append(self.pe_layer(coors[i])) + + feat = self.input_proj[i](feats[i]) + + src.append(feat) + + bs = src[0].shape[0] + query_embed = self.query_embed.weight.unsqueeze(0).repeat(bs, 1, 1) + output = self.query_feat.weight.unsqueeze(0).repeat(bs, 1, 1) + + predictions_class = [] + predictions_class_sem = [] + predictions_mask = [] + predictions_sem_embed = [] + + # predictions on learnable query features, first attn_mask + pred_result = None + pred_result = self.pred_heads( + output, + mask_features, + pad_mask=last_pad, + ) + + predictions_class.append(pred_result["outputs_class"]) + predictions_class_sem.append(pred_result["outputs_class_sem"]) + predictions_mask.append(pred_result["outputs_mask"]) + predictions_sem_embed.append(pred_result["outputs_sem_embed"]) + + for i in range(self.num_layers): + level_index = i % self.num_feature_levels + + attn_mask = None + if pred_result is not None: + attn_mask = pred_result["attn_mask"] + attn_mask[torch.where(attn_mask.sum(-1) == attn_mask.shape[-1])] = False + + # cross-attention first + output = self.transformer_cross_attention_layers[i]( + output, + src[level_index], + attn_mask=attn_mask, + padding_mask=pad_masks[level_index], + pos=pos[level_index], + query_pos=query_embed, + ) + output = self.transformer_self_attention_layers[i]( + output, attn_mask=None, padding_mask=None, query_pos=query_embed + ) + + # FFN + output = self.transformer_ffn_layers[i](output) + + # get predictions and attn mask for next feature level + pred_result = self.pred_heads( + output, + mask_features, + pad_mask=last_pad, + ) + + predictions_class.append(pred_result["outputs_class"]) + predictions_class_sem.append(pred_result["outputs_class_sem"]) + predictions_mask.append(pred_result["outputs_mask"]) + predictions_sem_embed.append(pred_result["outputs_sem_embed"]) + + assert len(predictions_mask) == self.num_layers + 1 + + out = { + "pred_logits": predictions_class[-1], + "pred_logits_sem": predictions_class_sem[-1], + "pred_masks": 
predictions_mask[-1], + "pred_sem_embed": predictions_sem_embed[-1], + "query_embeddings": output, + } + + return out, last_pad + + def pred_heads( + self, + output, + mask_features, + pad_mask=None, + ): + decoder_output = self.decoder_norm(output) + mask_embed = self.mask_embed(decoder_output) + + # The mask predictions outputs_mask are computed as a combination of mask_embed and mask_features + # using the einsum function from PyTorch. + # This function performs a batch-wise matrix multiplication between mask_embed and mask_features + # and outputs a tensor of shape (batch_size, num_points, num_queries). + # The result is a tensor that represents the mask prediction for each query and point in the batch. + outputs_mask = torch.einsum("bqc,bpc->bpq", mask_embed, mask_features) + + attn_mask = (outputs_mask.sigmoid() < 0.5).detach().bool() + attn_mask[pad_mask] = True + attn_mask = attn_mask.unsqueeze(1).repeat(1, self.num_heads, 1, 1).flatten(0, 1).permute(0, 2, 1) + + # The *outputs_class* tensor holds the "semantic information," + # essentially a distribution over possible classes for each of the queries. + # Each slice along the num_queries dimension can be seen as the model's prediction of the class + # of the object that each query represents. + # Dim: (num_batch, num_classes, num_query) + + # The *outputs_mask* tensor provides spatial information, indicating which points + # from the input point cloud are associated with each query. + # Each slice along the num_queries dimension in this tensor can be seen as a mask over + # the point cloud, highlighting the points that the model + # believes belong to the object represented by the corresponding query. + # Dim: (num_batch, num_points, num_query) + + result = { + "outputs_mask": outputs_mask, + "attn_mask": attn_mask, + } + + result["outputs_class"] = self.class_embed(decoder_output) + + result["outputs_class_sem"] = None # type: ignore + + result["outputs_sem_embed"] = None # type: ignore + + return result diff --git a/fvdb/projects/panoptic_segmentation/mask_pls/models/mask_pls/loss.py b/fvdb/projects/panoptic_segmentation/mask_pls/models/mask_pls/loss.py new file mode 100644 index 0000000000..2cab16ac82 --- /dev/null +++ b/fvdb/projects/panoptic_segmentation/mask_pls/models/mask_pls/loss.py @@ -0,0 +1,204 @@ +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: Apache-2.0 +# +from itertools import filterfalse + +import numpy as np +import torch +import torch.nn +import torch.nn.functional as F + +from .mask_model import MaskPLS + + +class SemLoss(torch.nn.Module): + def __init__( + self, + ignore_class: int, + sem_distil: bool, + loss_scales: dict, + input_mode: MaskPLS.DecoderInputMode = MaskPLS.DecoderInputMode.GRID, + ) -> None: + super().__init__() + self.ignore_class = ignore_class + self.sem_distil = sem_distil + self.weight_dict = loss_scales + self.input_mode = input_mode + + self.cross_entropy = torch.nn.CrossEntropyLoss(ignore_index=self.ignore_class) + + def forward(self, outputs, targets) -> dict: + padding = outputs["padding"] + bb_sem_logits = outputs["bb_sem_logits"] + bb_sem_embed_logits = outputs["bb_sem_embed_logits"] + + sem_labels = [torch.from_numpy(i).type(torch.long).to(padding.device) for i in targets["semantic_labels"]] + sem_labels = torch.cat(sem_labels) + + if self.input_mode == MaskPLS.DecoderInputMode.GRID: + # If the input to the loss function (which is the same as the input/output from the decoder) is the grid centers, + # (i.e. 
not the original xyz coordinates), we need to convert the targets to the grid centers as well. + input_vdbtensor = targets["vdbtensor"] + + # map target semantic labels to the grid + points = targets["xyz"] + + # get mapping of the coordinates to the grid for feature mapping + coord_ijks = input_vdbtensor.grid.world_to_grid(points).round().int() + inv_idx = input_vdbtensor.grid.ijk_to_inv_index(coord_ijks, cumulative=True) + sem_labels = sem_labels[inv_idx.jdata] + + sem_targets = sem_labels + + outputs = [] + batch_size = bb_sem_logits.shape[0] + for b in range(batch_size): + bb_sem_logit = bb_sem_logits[b][~padding[b]] + + if not self.training: + bb_sem_logit = bb_sem_logit[targets["inv_ind"][b]] + + outputs.append(bb_sem_logit) + + outputs = torch.cat(outputs) + + loss_sem_ce = self.cross_entropy(outputs, sem_targets) + loss_sem_lov = self.lovasz_softmax(F.softmax(outputs, dim=1), sem_targets, ignore=self.ignore_class) + + loss_dict = dict() + + # we check if loss is enabled for cleaner coding + loss_dict["loss_backbone_sem_ce"] = self.weight_dict.get("loss_backbone_sem_ce", 0.0) * loss_sem_ce + + # we check if loss is enabled for cleaner coding + loss_dict["loss_backbone_sem_lov"] = self.weight_dict.get("loss_backbone_sem_lov", 0.0) * loss_sem_lov + + if self.sem_distil and "loss_backbone_sem_distil" in self.weight_dict and targets["sem_embed"][0] is not None: + input1_list = [] + input2_list = [] + + for b in range(batch_size): + sem_embed_logits = bb_sem_embed_logits[b][~padding[b]] + + masks_ids = targets["masks_ids"][b] + num_masks = len(masks_ids) + + instance_ids = [np.unique(targets["ins_labels"][b][m_ids.cpu().numpy()]) for m_ids in masks_ids] + for instance_id in instance_ids: + assert len(instance_id) == 1 + + sem_embed = targets["sem_embed"][b] + sem_embed_ins = list(sem_embed["ins"]) + + embeds_ids = [sem_embed_ins.index(instance_id[0]) for instance_id in instance_ids] + + sem_embeds = sem_embed["embeds"][embeds_ids] + + for i in range(num_masks): + input1_list.append(sem_embed_logits[masks_ids[i]]) + input2_list.append(sem_embeds[i][None, ...].repeat(len(masks_ids[i]), 1)) + + input1 = torch.cat(input1_list) + input2 = torch.cat(input2_list) + + target = torch.ones(len(input2)).to(input2.device) + loss_distil = F.cosine_embedding_loss(input1, input2, target=target) + + loss_dict["loss_backbone_sem_distil"] = loss_distil * self.weight_dict["loss_backbone_sem_distil"] + + return loss_dict + + def lovasz_grad(self, gt_sorted): + """ + Computes gradient of the Lovasz extension w.r.t sorted errors + See Alg. 1 in paper + """ + p = len(gt_sorted) + gts = gt_sorted.sum() + intersection = gts - gt_sorted.float().cumsum(0) + union = gts + (1 - gt_sorted).float().cumsum(0) + jaccard = 1.0 - intersection / union + if p > 1: # cover 1-pixel case + jaccard[1:p] = jaccard[1:p] - jaccard[0:-1] + return jaccard + + def lovasz_softmax(self, probas, labels, classes="present", ignore=None): + """ + Multi-class Lovasz-Softmax loss + probas: [B, C, H, W] Variable, class probabilities at each prediction (between 0 and 1). + Interpreted as binary (sigmoid) output with outputs of size [B, H, W]. + labels: [B, H, W] Tensor, ground truth labels (between 0 and C - 1) + classes: 'all' for all, 'present' for classes present in labels, or a list of classes to average. 
+ per_image: compute the loss per image instead of per batch + ignore: void class labels + """ + loss = self.lovasz_softmax_flat(*self.flatten_probas(probas, labels, ignore), classes=classes) + return loss + + def lovasz_softmax_flat(self, probas, labels, classes="present"): + """ + Multi-class Lovasz-Softmax loss + probas: [P, C] Variable, class probabilities at each prediction (between 0 and 1) + labels: [P] Tensor, ground truth labels (between 0 and C - 1) + classes: 'all' for all, 'present' for classes present in labels, or a list of classes to average. + """ + if probas.numel() == 0: + # only void pixels, the gradients should be 0 + return probas * 0.0 + C = probas.size(1) + losses = [] + class_to_sum = list(range(C)) if classes in ["all", "present"] else classes + for c in class_to_sum: + fg = (labels == c).float() # foreground for class c + if classes == "present" and fg.sum() == 0: + continue + if C == 1: + if len(classes) > 1: + raise ValueError("Sigmoid output possible only with 1 class") + class_pred = probas[:, 0] + else: + class_pred = probas[:, c] + errors = (torch.autograd.Variable(fg) - class_pred).abs() + errors_sorted, perm = torch.sort(errors, 0, descending=True) + perm = perm.data + fg_sorted = fg[perm] + losses.append(torch.dot(errors_sorted, torch.autograd.Variable(self.lovasz_grad(fg_sorted)))) + return self.mean(losses) + + def flatten_probas(self, probas, labels, ignore=None): + """ + Flattens predictions in the batch + """ + # Probabilities from SparseTensor.features already flattened + N, C = probas.size() + probas = probas.contiguous().view(-1, C) + labels = labels.view(-1) + if ignore is None: + return probas, labels + valid = labels != ignore + vprobas = probas[torch.nonzero(valid).squeeze()] + vlabels = labels[valid] + return vprobas, vlabels + + def isnan(self, x): + return x != x + + def mean(self, l, ignore_nan=False, empty=0): + """ + nanmean compatible with generators. + """ + l = iter(l) + if ignore_nan: + l = filterfalse(self.isnan, l) + try: + n = 1 + acc = next(l) + except StopIteration: + if empty == "raise": + raise ValueError("Empty mean") + return empty + for n, v in enumerate(l, 2): + acc += v + if n == 1: + return acc + return acc / n diff --git a/fvdb/projects/panoptic_segmentation/mask_pls/models/mask_pls/mask_model.py b/fvdb/projects/panoptic_segmentation/mask_pls/models/mask_pls/mask_model.py new file mode 100644 index 0000000000..a1080866e6 --- /dev/null +++ b/fvdb/projects/panoptic_segmentation/mask_pls/models/mask_pls/mask_model.py @@ -0,0 +1,117 @@ +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: Apache-2.0 +# +from enum import Enum +from typing import Dict, Tuple + +import torch +import torch.nn + +import fvdb +import fvdb.nn + +from .backbone import MaskPLSEncoderDecoder, fVDBSyncBatchNorm +from .blocks import MLP +from .decoder import MaskedTransformerDecoder +from .utils import pad_batch + + +class MaskPLS(torch.nn.Module): + class DecoderInputMode(Enum): + XYZ = "xyz" + GRID = "grid" + + def __init__( + self, + num_classes: int, + dataset_extent: Tuple[float, float, float], + decoder_input_mode: DecoderInputMode = DecoderInputMode.GRID, + decoder_num_queries: int = 100, + segmentation_only=False, + ) -> None: + """ + Mask-Based Panoptic LiDAR Segmentation for Autonomous Driving + https://github.com/PRBonn/MaskPLS + Args: + num_classes (int): Number of classes for segmentation. + dataset_extent (Tuple[float, float, float]): The magnitude of the spatial extents of the dataset. 
+ decoder_input_mode (DecoderInputMode, optional): Mode for decoder input. Defaults to DecoderInputMode.GRID. + decoder_num_queries (int, optional): Number of queries for the decoder. Defaults to 100. + segmentation_only (bool, optional): If True, only segmentation is performed, masked decoder not used. Defaults to False. + Returns: + None + """ + super().__init__() + self.decoder_input_mode = decoder_input_mode + self.segmentation_only = segmentation_only + + backbone = MaskPLSEncoderDecoder(output_feature_levels=[3]) + self.backbone = fVDBSyncBatchNorm.convert_sync_batchnorm(backbone) + + self.sem_head = ( + fvdb.nn.Linear(self.backbone.channels[-1], num_classes) + if self.decoder_input_mode == MaskPLS.DecoderInputMode.GRID + else torch.nn.Linear(self.backbone.channels[-1], num_classes) + ) + + self.semantic_embedding_distil = False + if self.semantic_embedding_distil: + semantic_embedding_hidden_dims = [512, 1024, 768] + self.sem_embed = MLP( + self.backbone.channels[-1], + semantic_embedding_hidden_dims[:-1], + semantic_embedding_hidden_dims[-1], + use_fvdb=(self.decoder_input_mode == MaskPLS.DecoderInputMode.GRID), + ) + + if not self.segmentation_only: + self.decoder = MaskedTransformerDecoder( + num_classes, dataset_extent, backbone_channels=self.backbone.channels, num_queries=decoder_num_queries + ) + + def forward(self, x: Dict): + outputs = {} + + ###### Backbone ###### + out_feats_grids = self.backbone(x) + # out_feats_grids is a List[fvdb.nn.VDBTensor] + # where each VDBTensor corresponds to the `ouput_feature_levels` + # plus 1 additional entry which is the last/full-resolution feature level run through the conv mask projection + + ###### v2p ###### + # NOTE: Matching MaskPLS paper which performs v2p before sem_head + # In SAL, features are at voxel centers throughout, so we provide an option to try either + if self.decoder_input_mode == MaskPLS.DecoderInputMode.XYZ: + # If decoder inputs are the original points, we need to sample the features in the grid and pad them for form + # a minibatch for the semantic head and decoder + xyz = x["xyz"] + feats = [feats_grid.sample_trilinear(xyz).unbind() for feats_grid in out_feats_grids] + + # pad batch + feats, coords, pad_masks = pad_batch(feats, [xyz.unbind() for _ in feats]) # type: ignore + else: + feats = out_feats_grids + + logits = [self.sem_head(feats[-1])] + + if self.semantic_embedding_distil: + logits_sem_embed_grid = self.sem_embed(feats[-1]) + + if self.decoder_input_mode == MaskPLS.DecoderInputMode.GRID: + # produce a padded batch for the decoder and loss + coords = [feat.grid.grid_to_world(feat.ijk.float()).unbind() for feat in out_feats_grids] + feats = [feat.data.unbind() for feat in out_feats_grids] + logits = [ls.data.unbind() for ls in logits] + feats, coords, pad_masks, logits = pad_batch(feats, coords, additional_feats=logits) # type: ignore + + ###### Decoder ###### + if self.segmentation_only: + padding = pad_masks.pop() + else: + outputs, padding = self.decoder(feats, coords, pad_masks) + + outputs["bb_sem_logits"] = logits[0] + outputs["bb_sem_embed_logits"] = None if not self.semantic_embedding_distil else logits_sem_embed_grid + outputs["padding"] = padding + + return outputs diff --git a/fvdb/projects/panoptic_segmentation/mask_pls/models/mask_pls/positional_encoder.py b/fvdb/projects/panoptic_segmentation/mask_pls/models/mask_pls/positional_encoder.py new file mode 100644 index 0000000000..c8f4c162a6 --- /dev/null +++ b/fvdb/projects/panoptic_segmentation/mask_pls/models/mask_pls/positional_encoder.py @@ 
-0,0 +1,56 @@ +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: Apache-2.0 +# +import math + +import torch +import torch.nn as nn + + +class PositionalEncoder(nn.Module): + def __init__(self, feat_size, dataset_extent): + super().__init__() + self.feat_size = feat_size + self.dataset_extent = torch.tensor(dataset_extent) + + self.max_freq = 10000 + self.dimensionality = 3 + self.num_bands = math.floor(feat_size / self.dimensionality / 2) + self.base = 2 + self.activated = True + + pad = feat_size - self.num_bands * 2 * self.dimensionality + self.zero_pad = nn.ZeroPad2d((pad, 0, 0, 0)) # left padding + + def forward(self, coors): + """ + _x [B,N,3]: batched point coordinates + returns: [B,N,C]: positional encoding of dimension C + """ + if not self.activated: + return torch.zeros( + coors.shape[0], + coors.shape[1], + self.feat_size, + ).to(coors.device) + + x = coors.clone() # B, N, 3 + + x = x / self.dataset_extent.to(x.device) # B, N, 3 + + x = x.unsqueeze(-1) # B, N, 3 -> B, N, 3, 1 + scales = torch.logspace( + 0.0, + math.log(self.max_freq / 2) / math.log(self.base), + self.num_bands, + base=self.base, + device=x.device, + dtype=x.dtype, + ) + # reshaping + scales = scales[(*((None,) * (len(x.shape) - 1)), ...)] + x = x * scales * math.pi + x = torch.cat([x.sin(), x.cos()], dim=-1) # B, N, 3, 2 + x = x.flatten(2) # B, N, 6 + enc = self.zero_pad(x) # B, N, feat_dim + return enc diff --git a/fvdb/projects/panoptic_segmentation/mask_pls/models/mask_pls/utils.py b/fvdb/projects/panoptic_segmentation/mask_pls/models/mask_pls/utils.py new file mode 100644 index 0000000000..2e53771910 --- /dev/null +++ b/fvdb/projects/panoptic_segmentation/mask_pls/models/mask_pls/utils.py @@ -0,0 +1,39 @@ +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: Apache-2.0 +# +from typing import List, Optional + +import torch +import torch.nn.functional as F + + +def pad_batch(feats, coors=None, additional_feats: Optional[List] = None): + """ + From a list of multi-level features create a list of batched tensors with + features padded to the max number of points in the batch. 
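A quick check of the band arithmetic in `PositionalEncoder` above: with `feat_size=256` and 3 spatial dimensions, `num_bands = floor(256 / 3 / 2) = 42`, the sin/cos features occupy `42 * 2 * 3 = 252` channels, and the remaining 4 channels are zero-padded on the left. A minimal sketch under those assumptions (the `dataset_extent` values are placeholders, not values from the patch):

```python
import math
import torch

feat_size, dims = 256, 3
num_bands = math.floor(feat_size / dims / 2)    # 42 frequency bands per axis
encoded = num_bands * 2 * dims                  # sin + cos per band per axis -> 252 channels
pad = feat_size - encoded                       # 4 zero channels padded on the left

coors = torch.rand(2, 1000, 3)                  # [B, N, 3] batched point coordinates
pe = PositionalEncoder(feat_size, dataset_extent=(80.0, 80.0, 10.0))  # class defined above
assert pe(coors).shape == (2, 1000, feat_size)  # [B, N, C] positional encoding
```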
+ + returns: + feats: List of batched feature Tensors per feature level + coors: List of batched coordinate Tensors per feature level + pad_masks: List of batched bool Tensors indicating padding + """ + # get max number of points in the batch for each feature level + maxs = [max([level.shape[0] for level in batch]) for batch in feats] + # pad and batch each feature level in a single Tensor + if coors is not None: + coors = [ + torch.stack([F.pad(f, (0, 0, 0, maxs[i] - f.shape[0])) for f in batch]) for i, batch in enumerate(coors) + ] + pad_masks = [ + torch.stack([F.pad(torch.zeros_like(f[:, 0]), (0, maxs[i] - f.shape[0]), value=1).bool() for f in batch]) + for i, batch in enumerate(feats) + ] + feats = [torch.stack([F.pad(f, (0, 0, 0, maxs[i] - f.shape[0])) for f in batch]) for i, batch in enumerate(feats)] + if additional_feats is not None: + additional_feats = [ + torch.stack([F.pad(f, (0, 0, 0, maxs[i] - f.shape[0])) for f in batch]) + for i, batch in enumerate(additional_feats) + ] + + return feats, coors, pad_masks, additional_feats + return feats, coors, pad_masks diff --git a/fvdb/projects/panoptic_segmentation/mask_pls/train.py b/fvdb/projects/panoptic_segmentation/mask_pls/train.py new file mode 100644 index 0000000000..e5ae56f5d8 --- /dev/null +++ b/fvdb/projects/panoptic_segmentation/mask_pls/train.py @@ -0,0 +1,129 @@ +#!/usr/bin/env python3 +# Copyright Contributors to the OpenVDB Project +# SPDX-License-Identifier: Apache-2.0 +# +import logging +from enum import Enum +from pathlib import Path +from typing import Tuple + +import torch +import torch.utils +import torch.utils.data +import tqdm +import tyro +from data import ( + E57Dataset, + SemanticKITTIDataset, + SemanticSegmentationDatasetCollation, + fVDBSemanticSegmentationDatasetCollation, +) +from models import MaskPLS +from models.mask_pls.loss import SemLoss + + +class DatasetType(Enum): + E57 = "e57" + SemanticKITTI = "semanticKITTI" + + +def main( + dataset_type: DatasetType, + dataset_path: Path, + dataset_spatial_normalization: Tuple[float, float, float], + batch_size: int = 4, + decoder_input_mode: MaskPLS.DecoderInputMode = MaskPLS.DecoderInputMode.GRID, + decoder_num_queries: int = 100, +): + """Example project training a MaskPLS model for panoptic segmentation of LiDAR. + + Args: + dataset_type (DatasetType): Type of dataset to use (E57 or SemanticKITTI). + The SemanticKITTI dataset is expected to be in the standard SemanticKITTI format. + E57 is expected to be a directory containing E57 files and currently 'ground truth' labels are implemented as random classes + dataset_path (Path): Path to the dataset directory + dataset_spatial_normalization (Tuple[float, float, float]): Normalization factors for spatial coordinates of the points. + This should be set to the magnitude of the maximum spatial extent of the dataset in each dimension. + (i.e. if the dataset's points are in the range [-80, 100] in x, [-50, 20] in y, and [-10, 4] in z, this should be (100, 50, 10)) + batch_size (int, optional): Batch size for training. + decoder_input_mode (MaskPLS.DecoderInputMode, optional): Input mode for the spatial input to the decoder. + GRID corresponds to the grid centers of the sparse grid, POINTS corresponds to the original points. Using GRID can reduce memory usage whereas the original MaskPLS paper used POINTS. + decoder_num_queries (int, optional): Number of queries to use for the decoder. More queries will increase memory usage. 
+ + """ + #### Dataset #### + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + if dataset_type == DatasetType.E57: + dataset = E57Dataset(dataset_path) + elif dataset_type == DatasetType.SemanticKITTI: + dataset = SemanticKITTIDataset(dataset_path, split="train") + dataloader = torch.utils.data.DataLoader( + dataset, + batch_size=batch_size, + collate_fn=SemanticSegmentationDatasetCollation(), + ) + + fvdb_collate = fVDBSemanticSegmentationDatasetCollation(device=device) + + #### Model/Loss/Optimizer #### + model = MaskPLS( + dataset.num_classes, + dataset_spatial_normalization, + segmentation_only=False, + decoder_input_mode=decoder_input_mode, + decoder_num_queries=decoder_num_queries, + ).to(device) + + ignore_class = dataset.ignore_classes[0] + + backbone_loss_sem = SemLoss( + ignore_class, + sem_distil=False, + loss_scales={"loss_backbone_sem_ce": 2.0, "loss_backbone_sem_lov": 6.0}, + input_mode=decoder_input_mode, + ).to(device) + + optimizer = torch.optim.AdamW( + [p for p in model.parameters() if p.requires_grad], + lr=0.0001, + weight_decay=0.0001, + ) + + scheduler = torch.optim.lr_scheduler.MultiStepLR( + optimizer, + milestones=[45000, 55000], + gamma=0.1, + ) + + #### Training #### + with tqdm.tqdm(dataloader) as pbar: + network_time = 0.0 + for i, batch in enumerate(pbar): + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + + start.record() + + batch = fvdb_collate(batch) + + optimizer.zero_grad() + + outputs = model(batch) + + loss_dict = backbone_loss_sem(outputs, batch) + loss = sum(loss_dict.values()) + loss.backward() + optimizer.step() + + end.record() + + torch.cuda.synchronize() + network_time += start.elapsed_time(end) / 1000 + pbar.set_postfix(loss=f"{loss.item():.4f}", network_time=f"{network_time/(i+1):.2f}s/it") + + scheduler.step() + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + tyro.cli(main) diff --git a/fvdb/setup.py b/fvdb/setup.py index 5d1f545e2a..37d756f6d5 100644 --- a/fvdb/setup.py +++ b/fvdb/setup.py @@ -265,8 +265,12 @@ def download_and_install_cudnn() -> Tuple[List[str], List[str]]: for arch_flag in cpp_extension._get_cuda_arch_flags(): match = re.search(r"code=sm_(\d+)", arch_flag) if match: - if int(match.group(1)) < 80: - raise RuntimeError("ƒVDB requires a minimum compute capability of 8.0 (Ampere)") + cuda_arch = int(match.group(1)) + if cuda_arch < 70: + raise RuntimeError( + f"""A CUDA arch build target of {cuda_arch/10} was specified but ƒVDB must be built for at least compute capability 7.0 (Volta generation). 
+ Please set TORCH_CUDA_ARCH_LIST to a list of supported architectures >=7.0.""" + ) external_dir = get_external_dir() @@ -356,6 +360,7 @@ def retrieve_version(file_path: Path = get_cwd() / "fvdb/__init__.py") -> str: packages=[ "fvdb", "fvdb.nn", + "fvdb.optim", "fvdb.utils", "fvdb.utils.examples", "fvdb.utils.tests", diff --git a/fvdb/src/detail/build/Build.h b/fvdb/src/detail/build/Build.h index 38a323e4c0..3d5cbe14f2 100644 --- a/fvdb/src/detail/build/Build.h +++ b/fvdb/src/detail/build/Build.h @@ -1,5 +1,5 @@ // Copyright Contributors to the OpenVDB Project -// SPDX-License-Identifier: MPL-2.0 +// SPDX-License-Identifier: Apache-2.0 // #ifndef FVDB_DETAIL_BUILD_BUILD_H #define FVDB_DETAIL_BUILD_BUILD_H diff --git a/fvdb/src/detail/build/CoarseFromFine.cpp b/fvdb/src/detail/build/CoarseFromFine.cpp index d4b789d063..75768379eb 100644 --- a/fvdb/src/detail/build/CoarseFromFine.cpp +++ b/fvdb/src/detail/build/CoarseFromFine.cpp @@ -1,5 +1,5 @@ // Copyright Contributors to the OpenVDB Project -// SPDX-License-Identifier: MPL-2.0 +// SPDX-License-Identifier: Apache-2.0 // #include "Build.h" diff --git a/fvdb/src/detail/build/ConvGrid.cpp b/fvdb/src/detail/build/ConvGrid.cpp index cfa2e56e8d..e44188270b 100644 --- a/fvdb/src/detail/build/ConvGrid.cpp +++ b/fvdb/src/detail/build/ConvGrid.cpp @@ -1,5 +1,5 @@ // Copyright Contributors to the OpenVDB Project -// SPDX-License-Identifier: MPL-2.0 +// SPDX-License-Identifier: Apache-2.0 // #include "Build.h" diff --git a/fvdb/src/detail/build/DenseGrid.cpp b/fvdb/src/detail/build/DenseGrid.cpp index 5b3b080fc0..192a7c53f7 100644 --- a/fvdb/src/detail/build/DenseGrid.cpp +++ b/fvdb/src/detail/build/DenseGrid.cpp @@ -1,5 +1,5 @@ // Copyright Contributors to the OpenVDB Project -// SPDX-License-Identifier: MPL-2.0 +// SPDX-License-Identifier: Apache-2.0 // #include "Build.h" diff --git a/fvdb/src/detail/build/EmptyGrid.cpp b/fvdb/src/detail/build/EmptyGrid.cpp index eb07a527d5..8d0f82d7a2 100644 --- a/fvdb/src/detail/build/EmptyGrid.cpp +++ b/fvdb/src/detail/build/EmptyGrid.cpp @@ -1,5 +1,5 @@ // Copyright Contributors to the OpenVDB Project -// SPDX-License-Identifier: MPL-2.0 +// SPDX-License-Identifier: Apache-2.0 // #include "Build.h" diff --git a/fvdb/src/detail/build/FineFromCoarse.cpp b/fvdb/src/detail/build/FineFromCoarse.cpp index d9355a5e82..6c62ae761d 100644 --- a/fvdb/src/detail/build/FineFromCoarse.cpp +++ b/fvdb/src/detail/build/FineFromCoarse.cpp @@ -1,5 +1,5 @@ // Copyright Contributors to the OpenVDB Project -// SPDX-License-Identifier: MPL-2.0 +// SPDX-License-Identifier: Apache-2.0 // #include "Build.h" diff --git a/fvdb/src/detail/build/FromMesh.cpp b/fvdb/src/detail/build/FromMesh.cpp index b0025b2419..1ae906c339 100644 --- a/fvdb/src/detail/build/FromMesh.cpp +++ b/fvdb/src/detail/build/FromMesh.cpp @@ -1,5 +1,5 @@ // Copyright Contributors to the OpenVDB Project -// SPDX-License-Identifier: MPL-2.0 +// SPDX-License-Identifier: Apache-2.0 // #include "Build.h" diff --git a/fvdb/src/detail/build/NearestNeighborGridFromPoints.cpp b/fvdb/src/detail/build/NearestNeighborGridFromPoints.cpp index 805aa358a7..aaf9e611bb 100644 --- a/fvdb/src/detail/build/NearestNeighborGridFromPoints.cpp +++ b/fvdb/src/detail/build/NearestNeighborGridFromPoints.cpp @@ -1,5 +1,5 @@ // Copyright Contributors to the OpenVDB Project -// SPDX-License-Identifier: MPL-2.0 +// SPDX-License-Identifier: Apache-2.0 // #include "Build.h" diff --git a/fvdb/src/detail/build/PaddedGridFromCoords.cpp b/fvdb/src/detail/build/PaddedGridFromCoords.cpp index 
275c393b0d..3e7a02d4d7 100644 --- a/fvdb/src/detail/build/PaddedGridFromCoords.cpp +++ b/fvdb/src/detail/build/PaddedGridFromCoords.cpp @@ -1,5 +1,5 @@ // Copyright Contributors to the OpenVDB Project -// SPDX-License-Identifier: MPL-2.0 +// SPDX-License-Identifier: Apache-2.0 // #include "Build.h" diff --git a/fvdb/src/detail/build/PaddedGridFromGrid.cpp b/fvdb/src/detail/build/PaddedGridFromGrid.cpp index 4f78ee99f7..cba219d532 100644 --- a/fvdb/src/detail/build/PaddedGridFromGrid.cpp +++ b/fvdb/src/detail/build/PaddedGridFromGrid.cpp @@ -1,5 +1,5 @@ // Copyright Contributors to the OpenVDB Project -// SPDX-License-Identifier: MPL-2.0 +// SPDX-License-Identifier: Apache-2.0 // #include "Build.h" diff --git a/fvdb/src/detail/build/PaddedGridFromPoints.cpp b/fvdb/src/detail/build/PaddedGridFromPoints.cpp index 2c4219f185..b6e7414700 100644 --- a/fvdb/src/detail/build/PaddedGridFromPoints.cpp +++ b/fvdb/src/detail/build/PaddedGridFromPoints.cpp @@ -1,5 +1,5 @@ // Copyright Contributors to the OpenVDB Project -// SPDX-License-Identifier: MPL-2.0 +// SPDX-License-Identifier: Apache-2.0 // #include "Build.h"
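For completeness, the `train.py` entry point added by this patch is driven by `tyro.cli(main)`, so every parameter of `main` is exposed as a command-line flag (`python train.py --help` lists them). A minimal sketch of an equivalent direct call, assumed to run from the `mask_pls` project directory; the dataset path and normalization values below are placeholders, not values from the patch:

```python
from pathlib import Path

from models import MaskPLS
from train import DatasetType, main

main(
    dataset_type=DatasetType.SemanticKITTI,
    dataset_path=Path("/data/semanticKITTI"),          # assumed local dataset root
    dataset_spatial_normalization=(80.0, 80.0, 10.0),  # magnitude of max point extent per axis
    batch_size=2,
    decoder_input_mode=MaskPLS.DecoderInputMode.GRID,
    decoder_num_queries=100,
)
```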