diff --git a/examples/dynamo/cudagraphs_wrapper_example.py b/examples/dynamo/cudagraphs_wrapper_example.py
deleted file mode 100644
index 20e19a72ce..0000000000
--- a/examples/dynamo/cudagraphs_wrapper_example.py
+++ /dev/null
@@ -1,111 +0,0 @@
-"""
-.. _cudagraphs_wrapper_example:
-
-Wrapped runtime module for CUDA graphs
-======================================
-
-If Torch-TensorRT encounters unsupported operations during compilation, it can fall back to using
-PyTorch's native implementation for those specific operations. This fallback mechanism allows the
-rest of the model to be executed using TensorRT, while only the unsupported parts are handled by PyTorch.
-This fallback results in a graph break, which can reduce the overall performance benefit of using
-TensorRT because it introduces additional overhead from switching between TensorRT and PyTorch execution contexts.
-
-Applying CUDA Graphs to a PyTorch module that contains graph breaks can enhance performance by leveraging
-the benefits of CUDA Graphs even in the presence of these breaks. Torch-TensorRT provides a wrapper
-runtime module with CUDA Graphs for modules that have graph breaks, which allows you to mitigate the
-inefficiencies introduced by these breaks.
-"""

-# %%
-# Imports and Model Definition
-# ----------------------------------

-import torch
-import torch_tensorrt


-class SampleModel(torch.nn.Module):
-    def forward(self, x):
-        return torch.relu((x + 2) * 0.5)


-model = SampleModel().eval().cuda()
-input = torch.randn((1, 3, 224, 224)).to("cuda")

-# %%
-# Compiler options
-# ----------------------------------
-#
-# The 'torch_executed_ops' compiler option is used to demonstrate a module with graph breaks for this example.
-# The debug=True compiler option provides detailed insights into the compilation process and helps
-# pinpoint where graph breaks occur.

-# Create a TensorRT-compiled model
-trt_model = torch_tensorrt.compile(
-    model,
-    ir="dynamo",
-    inputs=[input],
-    min_block_size=1,
-    pass_through_build_failures=True,
-)

-trt_model_with_graph_break = torch_tensorrt.compile(
-    model,
-    ir="dynamo",
-    inputs=[input],
-    min_block_size=1,
-    pass_through_build_failures=True,
-    debug=True,
-    torch_executed_ops={"torch.ops.aten.mul.Tensor"},
-)

-# %%
-# Compiler log
-# ----------------------------------
-#
-# This compiler log indicates that the torch.ops.aten.mul.Tensor operator is executed by PyTorch.
-# Performance of this module can be enhanced by using the wrapped module.

-##############################################################################
-# .. code-block:: none
-#
-#     ++++++++++++++ Dry-Run Results for Graph +++++++++++++++++
-#
-#     The graph consists of 3 Total Operators, of which 2 operators are supported, 66.67% coverage
-#
-#     The following ops are currently unsupported or excluded from conversion, and are listed with their op-count in the graph:
-#     torch.ops.aten.mul.Tensor: 1
-#
-#     The following nodes are currently set to run in Torch:
-#     Node: torch.ops.aten.mul.Tensor, with layer location: /mul
-#     Note: Some of the above nodes may be supported, but were not included in a TRT graph by the partitioner

-# %%
-# TRT module with CUDA graphs
-# ----------------------------------
-#

-with torch_tensorrt.runtime.enable_cudagraphs(trt_model) as cudagraphs_module:
-    cudagraphs_module(input)

-# %%
-# Running the wrapped module with CUDA graphs
-# -------------------------------------------
-#
-# When CUDA Graphs are applied to a TensorRT model that contains graph breaks, each break introduces additional
-# overhead. This occurs because graph breaks prevent the entire model from being executed as a single, continuous
-# optimized unit. As a result, some of the performance benefits typically provided by CUDA Graphs, such as reduced
-# kernel launch overhead and improved execution efficiency, may be diminished.
-# Using a wrapped runtime module with CUDA Graphs allows you to encapsulate sequences of operations into graphs
-# that can be executed efficiently, even in the presence of graph breaks. When the CUDA Graph context manager is
-# used with the TensorRT module as a positional argument, it returns a wrapped module. This module captures the
-# execution graph, enabling efficient replay during subsequent inferences by reducing kernel launch overheads
-# and improving performance. Note that initializing the wrapper module involves a warm-up phase where the
-# module is executed several times. This warm-up ensures that memory allocations and initializations are not
-# recorded in CUDA Graphs, which helps maintain consistent execution paths and optimize performance.
-with torch_tensorrt.runtime.enable_cudagraphs(
-    trt_model_with_graph_break
-) as cudagraphs_module:
-    cudagraphs_module(input)

-# %%
diff --git a/py/torch_tensorrt/_compile.py b/py/torch_tensorrt/_compile.py
index 855c75a057..302928a784 100644
--- a/py/torch_tensorrt/_compile.py
+++ b/py/torch_tensorrt/_compile.py
@@ -12,8 +12,8 @@
 from torch_tensorrt._features import ENABLED_FEATURES
 from torch_tensorrt._Input import Input
 from torch_tensorrt.dynamo import _defaults
-from torch_tensorrt.dynamo.runtime._WrapperTorchTensorRTModule import (
-    WrapperTorchTensorRTModule,
+from torch_tensorrt.dynamo.runtime._CudaGraphsTorchTensorRTModule import (
+    CudaGraphsTorchTensorRTModule,
 )
 from torch_tensorrt.fx import InputTensorSpec
 from torch_tensorrt.fx.lower import compile as fx_compile
@@ -589,7 +589,7 @@ def save(
     Save the model to disk in the specified output format.
 
     Arguments:
-        module (Optional(torch.jit.ScriptModule | torch.export.ExportedProgram | torch.fx.GraphModule | WrapperTorchTensorRTModule)): Compiled Torch-TensorRT module
+        module (Optional(torch.jit.ScriptModule | torch.export.ExportedProgram | torch.fx.GraphModule | CudaGraphsTorchTensorRTModule)): Compiled Torch-TensorRT module
         inputs (torch.Tensor): Torch input tensors
         arg_inputs (Tuple[Any, ...]): Same as inputs. Alias for better understanding with kwarg_inputs.
         kwarg_inputs (dict[Any, ...]): Optional, kwarg inputs to the module forward function.
@@ -597,7 +597,7 @@ def save(
         retrace (bool): When the module type is a fx.GraphModule, this option re-exports the graph using torch.export.export(strict=False) to save it.
             This flag is experimental for now.
""" - if isinstance(module, WrapperTorchTensorRTModule): + if isinstance(module, CudaGraphsTorchTensorRTModule): module = module.compiled_module module_type = _parse_module_type(module) accepted_formats = {"exported_program", "torchscript"} diff --git a/py/torch_tensorrt/dynamo/runtime/_WrapperTorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_CudaGraphsTorchTensorRTModule.py similarity index 97% rename from py/torch_tensorrt/dynamo/runtime/_WrapperTorchTensorRTModule.py rename to py/torch_tensorrt/dynamo/runtime/_CudaGraphsTorchTensorRTModule.py index 86133b43c2..e7afeef398 100644 --- a/py/torch_tensorrt/dynamo/runtime/_WrapperTorchTensorRTModule.py +++ b/py/torch_tensorrt/dynamo/runtime/_CudaGraphsTorchTensorRTModule.py @@ -11,7 +11,7 @@ logger = logging.getLogger(__name__) -class WrapperTorchTensorRTModule(torch.nn.Module): # type: ignore[misc] +class CudaGraphsTorchTensorRTModule(torch.nn.Module): # type: ignore[misc] """This Wrapper runtime module is to record/replay whole cuda graph in sub modules Args: @@ -24,7 +24,7 @@ def __init__( self, compiled_module: torch.nn.Module, ): - super(WrapperTorchTensorRTModule, self).__init__() + super(CudaGraphsTorchTensorRTModule, self).__init__() self.compiled_module = compiled_module self.inputs = partitioning.construct_submodule_inputs(compiled_module) diff --git a/py/torch_tensorrt/runtime/_cudagraphs.py b/py/torch_tensorrt/runtime/_cudagraphs.py index e9893ec64e..6bdc07cbce 100644 --- a/py/torch_tensorrt/runtime/_cudagraphs.py +++ b/py/torch_tensorrt/runtime/_cudagraphs.py @@ -3,8 +3,8 @@ import torch import torch_tensorrt -from torch_tensorrt.dynamo.runtime._WrapperTorchTensorRTModule import ( - WrapperTorchTensorRTModule, +from torch_tensorrt.dynamo.runtime._CudaGraphsTorchTensorRTModule import ( + CudaGraphsTorchTensorRTModule, ) @@ -90,7 +90,7 @@ def __enter__(self) -> torch.nn.Module: logger.debug( f"{num_torch_module} torch modules are in subgraphs. 
             )
-            return WrapperTorchTensorRTModule(self.compiled_module)
+            return CudaGraphsTorchTensorRTModule(self.compiled_module)
         else:
             if num_trt_module > 0:
                 logger.debug(
diff --git a/py/torch_tensorrt/runtime/_weight_streaming.py b/py/torch_tensorrt/runtime/_weight_streaming.py
index 4ec7fb02c5..8ae5dc1841 100755
--- a/py/torch_tensorrt/runtime/_weight_streaming.py
+++ b/py/torch_tensorrt/runtime/_weight_streaming.py
@@ -3,8 +3,8 @@
 import torch
 from torch_tensorrt.dynamo.runtime import PythonTorchTensorRTModule, TorchTensorRTModule
 
-from torch_tensorrt.dynamo.runtime._WrapperTorchTensorRTModule import (
-    WrapperTorchTensorRTModule,
+from torch_tensorrt.dynamo.runtime._CudaGraphsTorchTensorRTModule import (
+    CudaGraphsTorchTensorRTModule,
 )
 
 logger = logging.getLogger(__name__)
@@ -16,12 +16,12 @@ class _WeightStreamingContextManager(object):
     """
 
     def __init__(
-        self, module: torch.fx.GraphModule | WrapperTorchTensorRTModule
+        self, module: torch.fx.GraphModule | CudaGraphsTorchTensorRTModule
     ) -> None:
         rt_mods = []
         self.current_device_budget = 0
 
-        if isinstance(module, WrapperTorchTensorRTModule):
+        if isinstance(module, CudaGraphsTorchTensorRTModule):
             module = module.compiled_module
         for name, rt_mod in module.named_children():
             if "_run_on_acc" in name and isinstance(
diff --git a/tests/py/dynamo/runtime/test_005_wrapper_cudagraphs.py b/tests/py/dynamo/runtime/test_005_wrapper_cudagraphs.py
deleted file mode 100644
index 7ea89e1e12..0000000000
--- a/tests/py/dynamo/runtime/test_005_wrapper_cudagraphs.py
+++ /dev/null
@@ -1,205 +0,0 @@
-import torch
-import torch_tensorrt as torchtrt
-from parameterized import parameterized
-from torch.testing._internal.common_utils import TestCase, run_tests
-from torch_tensorrt.dynamo.runtime._WrapperTorchTensorRTModule import (
-    WrapperTorchTensorRTModule,
-)
-
-INPUT_SIZE = (3, 16, 16)
-TRIALS = 5
-
-
-class TestWrapperCudagraphs(TestCase):
-    @parameterized.expand(
-        [
-            ("python_runtime", True, False),
-            ("python_runtime_multi_out", True, True),
-            ("cpp_runtime", False, False),
-            ("cpp_runtime_multi_out", False, True),
-        ]
-    )
-    def test_wrapper_cudagraphs(self, _, use_python_runtime, multi_output):
-        class SampleModel(torch.nn.Module):
-            def forward(self, x):
-                return torch.relu((x + 2) * 0.5)
-
-        class SampleModelMultiOutput(torch.nn.Module):
-            def forward(self, x):
-                return torch.relu((x + 2) * 0.5), torch.relu((x - 2) * 2.1)
-
-        input_list = []
-        for _ in range(TRIALS):
-            input = [torch.randn(*INPUT_SIZE, dtype=torch.float32).cuda()]
-            input_list.append(input)
-
-        # Select the multi-output variant when multi_output is set
-        model = SampleModelMultiOutput() if multi_output else SampleModel()
-        fx_graph = torch.fx.symbolic_trace(model)
-
-        # Validate that the results between Torch and Torch-TRT are similar
-        optimized_model = torchtrt.compile(
-            fx_graph,
-            "dynamo",
-            input_list[0],
-            min_block_size=1,
-            pass_through_build_failures=True,
-            torch_executed_ops={"torch.ops.aten.mul.Tensor"},
-            use_python_runtime=use_python_runtime,
-        )
-
-        ref_out_list = []
-        trt_out_list = []
-        for enable_cuda_graphs in [False, True]:
-            for i in range(len(input_list)):
-                # Toggle the CUDA graphs mode between iterations
-                if i % TRIALS == i // TRIALS:
-                    cuda_graphs = enable_cuda_graphs
-                else:
-                    cuda_graphs = not enable_cuda_graphs
-
-                if cuda_graphs:
-                    with torchtrt.runtime.enable_cudagraphs(
-                        optimized_model
-                    ) as cudagraphs_module:
-                        trt_out_list.append(cudagraphs_module(*input_list[i]))
-                else:
-                    torchtrt.runtime.set_cudagraphs_mode(False)
-                    trt_out_list.append(optimized_model(*input_list[i]))
-
-                ref_out_list.append(fx_graph(*input_list[i]))
-
-        for optimized_model_results, torch_model_results in zip(
-            trt_out_list, ref_out_list
-        ):
-            torch.testing.assert_close(
-                torch_model_results,
-                optimized_model_results,
-                rtol=5e-03,
-                atol=5e-03,
-                equal_nan=True,
-                check_dtype=True,
-            )
-        torch._dynamo.reset()
-
-    @parameterized.expand(
-        [
-            ("python_runtime", True),
-            ("cpp_runtime", False),
-        ]
-    )
-    def test_wrapper_cudagraphs_dynamic(self, _, use_python_runtime):
-        class SampleModel(torch.nn.Module):
-            def forward(self, x):
-                return torch.relu((x + 2) * 0.5)
-
-        inputs = torchtrt.Input(
-            min_shape=(1, 3, 128, 224),
-            opt_shape=(8, 3, 192, 224),
-            max_shape=(16, 3, 224, 224),
-            dtype=torch.float,
-            name="x",
-        )
-        fx_graph = torch.fx.symbolic_trace(SampleModel())
-
-        optimized_model = torchtrt.compile(
-            fx_graph,
-            "dynamo",
-            inputs,
-            min_block_size=1,
-            pass_through_build_failures=True,
-            torch_executed_ops={"torch.ops.aten.mul.Tensor"},
-            use_python_runtime=use_python_runtime,
-        )
-
-        input_list = []
-        ref_out_list = []
-        trt_out_list = []
-        # Alternate the CUDA graphs mode and input shapes every five iterations.
-        for i in [1, 3, 8, 11, 16]:
-            for j in [128, 128, 222, 222, 224]:
-                input_list.append(torch.randn((i, 3, j, 224)).cuda())
-
-        for enable_cuda_graphs in [False, True]:
-            for i in range(len(input_list)):
-                # Toggle the CUDA graphs mode between iterations
-                if i % TRIALS == i // TRIALS:
-                    cuda_graphs = enable_cuda_graphs
-                else:
-                    cuda_graphs = not enable_cuda_graphs
-
-                if cuda_graphs:
-                    with torchtrt.runtime.enable_cudagraphs(
-                        optimized_model
-                    ) as cudagraphs_module:
-                        trt_out_list.append(cudagraphs_module(input_list[i]))
-                else:
-                    torchtrt.runtime.set_cudagraphs_mode(False)
-                    trt_out_list.append(optimized_model(input_list[i]))
-                ref_out_list.append(fx_graph(input_list[i]))
-
-        for optimized_model_results, torch_model_results in zip(
-            trt_out_list, ref_out_list
-        ):
-            torch.testing.assert_close(
-                torch_model_results,
-                optimized_model_results,
-                rtol=5e-03,
-                atol=5e-03,
-                equal_nan=True,
-                check_dtype=True,
-            )
-        torch._dynamo.reset()
-
-    @parameterized.expand(
-        [
-            ("python_runtime", True),
-            ("cpp_runtime", False),
-        ]
-    )
-    def test_wrapper_cudagraphs_api(self, _, use_python_runtime):
-        class SampleModel(torch.nn.Module):
-            def forward(self, x):
-                return torch.relu((x + 2) * 0.5)
-
-        model = SampleModel().eval().cuda()
-        input_list = []
-        trt_out_list = []
-        ref_out_list = []
-
-        for _ in range(TRIALS):
-            input = [torch.randn((64, 32), dtype=torch.float32).cuda()]
-            input_list.append(input)
-        fx_graph = torch.fx.symbolic_trace(model)
-
-        optimized_model = torchtrt.compile(
-            fx_graph,
-            inputs=input_list[0],
-            ir="dynamo",
-            min_block_size=1,
-            cache_built_engines=False,
-            reuse_cached_engines=False,
-            torch_executed_ops={"torch.ops.aten.convolution.default"},
-            use_python_runtime=use_python_runtime,
-        )
-
-        with torchtrt.runtime.enable_cudagraphs(optimized_model) as cudagraphs_module:
-            for i in range(TRIALS):
-                trt_out_list.append(cudagraphs_module(*input_list[i]))
-                ref_out_list.append(fx_graph(*input_list[i]))
-
-        for optimized_model_results, torch_model_results in zip(
-            trt_out_list, ref_out_list
-        ):
-            torch.testing.assert_close(
-                torch_model_results,
-                optimized_model_results,
-                rtol=5e-03,
-                atol=5e-03,
-                equal_nan=True,
-                check_dtype=True,
-            )
-        torch._dynamo.reset()
-
-
-if __name__ == "__main__":
-    run_tests()
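
For reviewer context, a minimal sketch of how the renamed class surfaces through the public API after this patch, following the compile/enable_cudagraphs pattern of the example and tests removed above. The model, input shape, and option values are illustrative only and are not part of this change:

```python
import torch
import torch_tensorrt
from torch_tensorrt.dynamo.runtime._CudaGraphsTorchTensorRTModule import (
    CudaGraphsTorchTensorRTModule,
)


class SampleModel(torch.nn.Module):
    def forward(self, x):
        return torch.relu((x + 2) * 0.5)


model = SampleModel().eval().cuda()
x = torch.randn((1, 3, 224, 224)).cuda()

# torch_executed_ops forces a graph break, so the compiled module keeps a
# torch submodule alongside the TRT engine.
trt_model = torch_tensorrt.compile(
    model,
    ir="dynamo",
    inputs=[x],
    min_block_size=1,
    torch_executed_ops={"torch.ops.aten.mul.Tensor"},
)

# Because torch submodules are present, __enter__ returns the renamed
# CudaGraphsTorchTensorRTModule (formerly WrapperTorchTensorRTModule).
with torch_tensorrt.runtime.enable_cudagraphs(trt_model) as cudagraphs_module:
    assert isinstance(cudagraphs_module, CudaGraphsTorchTensorRTModule)
    out = cudagraphs_module(x)
```

Note that torch_tensorrt.save also accepts the wrapper directly and unwraps it to compiled_module, per the isinstance check updated in _compile.py.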