diff --git a/examples/dynamo/cudagraphs_wrapper_example.py b/examples/dynamo/cudagraphs_wrapper_example.py
deleted file mode 100644
index 20e19a72ce..0000000000
--- a/examples/dynamo/cudagraphs_wrapper_example.py
+++ /dev/null
@@ -1,111 +0,0 @@
-"""
-.. _cudagraphs_wrapper_example:
-
-Wrapped runtime module for CUDA graphs
-======================================
-
-If Torch-TensorRT encounters unsupported operations during compilation, it can fall back to using
-PyTorch's native implementation for those specific operations. This fallback mechanism allows the
-rest of the model to be executed using TensorRT, while only the unsupported parts are handled by PyTorch.
-This fallback results in a graph break, which can reduce the overall performance benefit of using
-TensorRT because it introduces additional overhead from switching between TensorRT and PyTorch execution contexts.
-
-Applying CUDA Graphs to a PyTorch module that contains graph breaks can enhance performance by leveraging
-the benefits of CUDA Graphs even in the presence of these breaks. Torch-TensorRT provides a wrapper
-runtime module with CUDA Graphs for modules that have graph breaks, which allows you to mitigate the
-inefficiencies introduced by these breaks.
-"""

-# %%
-# Imports and Model Definition
-# ----------------------------------

-import torch
-import torch_tensorrt


-class SampleModel(torch.nn.Module):
-    def forward(self, x):
-        return torch.relu((x + 2) * 0.5)


-model = SampleModel().eval().cuda()
-input = torch.randn((1, 3, 224, 224)).to("cuda")

-# %%
-# Compiler options
-# ----------------------------------
-#
-# The 'torch_executed_ops' compiler option is used to demonstrate a module with graph breaks for this example.
-# The debug=True compiler option provides detailed insights into the compilation process and helps
-# pinpoint where graph breaks occur.

-# Create a TensorRT-compiled model
-trt_model = torch_tensorrt.compile(
-    model,
-    ir="dynamo",
-    inputs=[input],
-    min_block_size=1,
-    pass_through_build_failures=True,
-)

-trt_model_with_graph_break = torch_tensorrt.compile(
-    model,
-    ir="dynamo",
-    inputs=[input],
-    min_block_size=1,
-    pass_through_build_failures=True,
-    debug=True,
-    torch_executed_ops={"torch.ops.aten.mul.Tensor"},
-)

-# %%
-# Compiler log
-# ----------------------------------
-#
-# This compiler log indicates that the torch.ops.aten.mul.Tensor operator is executed by PyTorch.
-# Performance of this module can be enhanced by using the wrapped module.

-##############################################################################
-# .. code-block:: none
-#
-#     ++++++++++++++ Dry-Run Results for Graph +++++++++++++++++
-#
-#     The graph consists of 3 Total Operators, of which 2 operators are supported, 66.67% coverage
-#
-#     The following ops are currently unsupported or excluded from conversion, and are listed with their op-count in the graph:
-#     torch.ops.aten.mul.Tensor: 1
-#
-#     The following nodes are currently set to run in Torch:
-#     Node: torch.ops.aten.mul.Tensor, with layer location: /mul
-#     Note: Some of the above nodes may be supported, but were not included in a TRT graph by the partitioner

-# %%
-# TRT module with CUDA graphs
-# ----------------------------------
-#

-with torch_tensorrt.runtime.enable_cudagraphs(trt_model) as cudagraphs_module:
-    cudagraphs_module(input)

-# %%
-# Running the wrapped module with CUDA graphs
-# -------------------------------------------
-#
-# When CUDA Graphs are applied to a TensorRT model that contains graph breaks, each break introduces additional
-# overhead. This occurs because graph breaks prevent the entire model from being executed as a single, continuous
-# optimized unit. As a result, some of the performance benefits typically provided by CUDA Graphs, such as reduced
-# kernel launch overhead and improved execution efficiency, may be diminished.
-# Using a wrapped runtime module with CUDA Graphs allows you to encapsulate sequences of operations into graphs
-# that can be executed efficiently, even in the presence of graph breaks. When the CUDA Graph context manager is
-# used with the TensorRT module as a positional argument, it returns a wrapped module. This module captures the
-# execution graph, enabling efficient replay during subsequent inferences by reducing kernel launch overheads
-# and improving performance. Note that initializing the wrapper module involves a warm-up phase where the
-# module is executed several times. This warm-up ensures that memory allocations and initializations are not
-# recorded in CUDA Graphs, which helps maintain consistent execution paths and optimize performance.
-with torch_tensorrt.runtime.enable_cudagraphs(
-    trt_model_with_graph_break
-) as cudagraphs_module:
-    cudagraphs_module(input)

-# %%
diff --git a/py/torch_tensorrt/_compile.py b/py/torch_tensorrt/_compile.py
index 855c75a057..302928a784 100644
--- a/py/torch_tensorrt/_compile.py
+++ b/py/torch_tensorrt/_compile.py
@@ -12,8 +12,8 @@
 from torch_tensorrt._features import ENABLED_FEATURES
 from torch_tensorrt._Input import Input
 from torch_tensorrt.dynamo import _defaults
-from torch_tensorrt.dynamo.runtime._WrapperTorchTensorRTModule import (
-    WrapperTorchTensorRTModule,
+from torch_tensorrt.dynamo.runtime._CudaGraphsTorchTensorRTModule import (
+    CudaGraphsTorchTensorRTModule,
 )
 from torch_tensorrt.fx import InputTensorSpec
 from torch_tensorrt.fx.lower import compile as fx_compile
@@ -589,7 +589,7 @@ def save(
     Save the model to disk in the specified output format.
 
     Arguments:
-        module (Optional(torch.jit.ScriptModule | torch.export.ExportedProgram | torch.fx.GraphModule | WrapperTorchTensorRTModule)): Compiled Torch-TensorRT module
+        module (Optional(torch.jit.ScriptModule | torch.export.ExportedProgram | torch.fx.GraphModule | CudaGraphsTorchTensorRTModule)): Compiled Torch-TensorRT module
         inputs (torch.Tensor): Torch input tensors
         arg_inputs (Tuple[Any, ...]): Same as inputs. Alias for better understanding with kwarg_inputs.
         kwarg_inputs (dict[Any, ...]): Optional, kwarg inputs to the module forward function.
@@ -597,7 +597,7 @@ def save(
         retrace (bool): When the module type is a fx.GraphModule, this option re-exports the graph using torch.export.export(strict=False) to save it.
             This flag is experimental for now.
""" - if isinstance(module, WrapperTorchTensorRTModule): + if isinstance(module, CudaGraphsTorchTensorRTModule): module = module.compiled_module module_type = _parse_module_type(module) accepted_formats = {"exported_program", "torchscript"} diff --git a/py/torch_tensorrt/dynamo/runtime/_WrapperTorchTensorRTModule.py b/py/torch_tensorrt/dynamo/runtime/_CudaGraphsTorchTensorRTModule.py similarity index 97% rename from py/torch_tensorrt/dynamo/runtime/_WrapperTorchTensorRTModule.py rename to py/torch_tensorrt/dynamo/runtime/_CudaGraphsTorchTensorRTModule.py index 86133b43c2..e7afeef398 100644 --- a/py/torch_tensorrt/dynamo/runtime/_WrapperTorchTensorRTModule.py +++ b/py/torch_tensorrt/dynamo/runtime/_CudaGraphsTorchTensorRTModule.py @@ -11,7 +11,7 @@ logger = logging.getLogger(__name__) -class WrapperTorchTensorRTModule(torch.nn.Module): # type: ignore[misc] +class CudaGraphsTorchTensorRTModule(torch.nn.Module): # type: ignore[misc] """This Wrapper runtime module is to record/replay whole cuda graph in sub modules Args: @@ -24,7 +24,7 @@ def __init__( self, compiled_module: torch.nn.Module, ): - super(WrapperTorchTensorRTModule, self).__init__() + super(CudaGraphsTorchTensorRTModule, self).__init__() self.compiled_module = compiled_module self.inputs = partitioning.construct_submodule_inputs(compiled_module) diff --git a/py/torch_tensorrt/runtime/_cudagraphs.py b/py/torch_tensorrt/runtime/_cudagraphs.py index e9893ec64e..6bdc07cbce 100644 --- a/py/torch_tensorrt/runtime/_cudagraphs.py +++ b/py/torch_tensorrt/runtime/_cudagraphs.py @@ -3,8 +3,8 @@ import torch import torch_tensorrt -from torch_tensorrt.dynamo.runtime._WrapperTorchTensorRTModule import ( - WrapperTorchTensorRTModule, +from torch_tensorrt.dynamo.runtime._CudaGraphsTorchTensorRTModule import ( + CudaGraphsTorchTensorRTModule, ) @@ -90,7 +90,7 @@ def __enter__(self) -> torch.nn.Module: logger.debug( f"{num_torch_module} torch modules are in subgraphs. 
             )
-            return WrapperTorchTensorRTModule(self.compiled_module)
+            return CudaGraphsTorchTensorRTModule(self.compiled_module)
         else:
             if num_trt_module > 0:
                 logger.debug(
diff --git a/py/torch_tensorrt/runtime/_weight_streaming.py b/py/torch_tensorrt/runtime/_weight_streaming.py
index 4ec7fb02c5..8ae5dc1841 100755
--- a/py/torch_tensorrt/runtime/_weight_streaming.py
+++ b/py/torch_tensorrt/runtime/_weight_streaming.py
@@ -3,8 +3,8 @@
 import torch
 from torch_tensorrt.dynamo.runtime import PythonTorchTensorRTModule, TorchTensorRTModule
 
-from torch_tensorrt.dynamo.runtime._WrapperTorchTensorRTModule import (
-    WrapperTorchTensorRTModule,
+from torch_tensorrt.dynamo.runtime._CudaGraphsTorchTensorRTModule import (
+    CudaGraphsTorchTensorRTModule,
 )
 
 logger = logging.getLogger(__name__)
@@ -16,12 +16,12 @@ class _WeightStreamingContextManager(object):
     """
 
     def __init__(
-        self, module: torch.fx.GraphModule | WrapperTorchTensorRTModule
+        self, module: torch.fx.GraphModule | CudaGraphsTorchTensorRTModule
     ) -> None:
         rt_mods = []
         self.current_device_budget = 0
 
-        if isinstance(module, WrapperTorchTensorRTModule):
+        if isinstance(module, CudaGraphsTorchTensorRTModule):
             module = module.compiled_module
         for name, rt_mod in module.named_children():
             if "_run_on_acc" in name and isinstance(
diff --git a/tests/py/dynamo/runtime/test_005_wrapper_cudagraphs.py b/tests/py/dynamo/runtime/test_005_wrapper_cudagraphs.py
deleted file mode 100644
index 7ea89e1e12..0000000000
--- a/tests/py/dynamo/runtime/test_005_wrapper_cudagraphs.py
+++ /dev/null
@@ -1,205 +0,0 @@
-import torch
-import torch_tensorrt as torchtrt
-from parameterized import parameterized
-from torch.testing._internal.common_utils import TestCase, run_tests
-from torch_tensorrt.dynamo.runtime._WrapperTorchTensorRTModule import (
-    WrapperTorchTensorRTModule,
-)
-
-INPUT_SIZE = (3, 16, 16)
-TRIALS = 5
-
-
-class TestWrapperCudagraphs(TestCase):
-    @parameterized.expand(
-        [
-            ("python_runtime", True, False),
-            ("python_runtime_multi_out", True, True),
-            ("cpp_runtime", False, False),
-            ("cpp_runtime_multi_out", False, True),
-        ]
-    )
-    def test_wrapper_cudagraphs(self, _, use_python_runtime, multi_output):
-        class SampleModel(torch.nn.Module):
-            def forward(self, x):
-                return torch.relu((x + 2) * 0.5)
-
-        class SampleModelMultiOutput(torch.nn.Module):
-            def forward(self, x):
-                return torch.relu((x + 2) * 0.5), torch.relu((x - 2) * 2.1)
-
-        input_list = []
-        for _ in range(TRIALS):
-            input = [torch.randn(*INPUT_SIZE, dtype=torch.float32).cuda()]
-            input_list.append(input)
-
-        # Select the multi-output variant when multi_output is set
-        model = SampleModelMultiOutput() if multi_output else SampleModel()
-        fx_graph = torch.fx.symbolic_trace(model)
-
-        # Validate that the results between Torch and Torch-TRT are similar
-        optimized_model = torchtrt.compile(
-            fx_graph,
-            "dynamo",
-            input_list[0],
-            min_block_size=1,
-            pass_through_build_failures=True,
-            torch_executed_ops={"torch.ops.aten.mul.Tensor"},
-            use_python_runtime=use_python_runtime,
-        )
-
-        ref_out_list = []
-        trt_out_list = []
-        for enable_cuda_graphs in [False, True]:
-            for i in range(len(input_list)):
-                # Toggle the CUDA graphs mode between iterations
-                if i % TRIALS == i // TRIALS:
-                    cuda_graphs = enable_cuda_graphs
-                else:
-                    cuda_graphs = not enable_cuda_graphs
-
-                if cuda_graphs:
-                    with torchtrt.runtime.enable_cudagraphs(
-                        optimized_model
-                    ) as cudagraphs_module:
-                        trt_out_list.append(cudagraphs_module(*input_list[i]))
-                else:
-                    torchtrt.runtime.set_cudagraphs_mode(False)
-                    trt_out_list.append(optimized_model(*input_list[i]))
-
-                ref_out_list.append(fx_graph(*input_list[i]))
-
-        for optimized_model_results, torch_model_results in zip(
-            trt_out_list, ref_out_list
-        ):
-            torch.testing.assert_close(
-                torch_model_results,
-                optimized_model_results,
-                rtol=5e-03,
-                atol=5e-03,
-                equal_nan=True,
-                check_dtype=True,
-            )
-        torch._dynamo.reset()
-
-    @parameterized.expand(
-        [
-            ("python_runtime", True),
-            ("cpp_runtime", False),
-        ]
-    )
-    def test_wrapper_cudagraphs_dynamic(self, _, use_python_runtime):
-        class SampleModel(torch.nn.Module):
-            def forward(self, x):
-                return torch.relu((x + 2) * 0.5)
-
-        inputs = torchtrt.Input(
-            min_shape=(1, 3, 128, 224),
-            opt_shape=(8, 3, 192, 224),
-            max_shape=(16, 3, 224, 224),
-            dtype=torch.float,
-            name="x",
-        )
-        fx_graph = torch.fx.symbolic_trace(SampleModel())
-
-        optimized_model = torchtrt.compile(
-            fx_graph,
-            "dynamo",
-            inputs,
-            min_block_size=1,
-            pass_through_build_failures=True,
-            torch_executed_ops={"torch.ops.aten.mul.Tensor"},
-            use_python_runtime=use_python_runtime,
-        )
-
-        input_list = []
-        ref_out_list = []
-        trt_out_list = []
-        # Alternate the CUDA graphs mode and input shapes every five iterations.
-        for i in [1, 3, 8, 11, 16]:
-            for j in [128, 128, 222, 222, 224]:
-                input_list.append(torch.randn((i, 3, j, 224)).cuda())
-
-        for enable_cuda_graphs in [False, True]:
-            for i in range(len(input_list)):
-                # Toggle the CUDA graphs mode between iterations
-                if i % TRIALS == i // TRIALS:
-                    cuda_graphs = enable_cuda_graphs
-                else:
-                    cuda_graphs = not enable_cuda_graphs
-
-                if cuda_graphs:
-                    with torchtrt.runtime.enable_cudagraphs(
-                        optimized_model
-                    ) as cudagraphs_module:
-                        trt_out_list.append(cudagraphs_module(input_list[i]))
-                else:
-                    torchtrt.runtime.set_cudagraphs_mode(False)
-                    trt_out_list.append(optimized_model(input_list[i]))
-                ref_out_list.append(fx_graph(input_list[i]))
-
-        for optimized_model_results, torch_model_results in zip(
-            trt_out_list, ref_out_list
-        ):
-            torch.testing.assert_close(
-                torch_model_results,
-                optimized_model_results,
-                rtol=5e-03,
-                atol=5e-03,
-                equal_nan=True,
-                check_dtype=True,
-            )
-        torch._dynamo.reset()
-
-    @parameterized.expand(
-        [
-            ("python_runtime", True),
-            ("cpp_runtime", False),
-        ]
-    )
-    def test_wrapper_cudagraphs_api(self, _, use_python_runtime):
-        class SampleModel(torch.nn.Module):
-            def forward(self, x):
-                return torch.relu((x + 2) * 0.5)
-
-        model = SampleModel().eval().cuda()
-        input_list = []
-        trt_out_list = []
-        ref_out_list = []
-
-        for _ in range(TRIALS):
-            input = [torch.randn((64, 32), dtype=torch.float32).cuda()]
-            input_list.append(input)
-        fx_graph = torch.fx.symbolic_trace(model)
-
-        optimized_model = torchtrt.compile(
-            fx_graph,
-            inputs=input_list[0],
-            ir="dynamo",
-            min_block_size=1,
-            cache_built_engines=False,
-            reuse_cached_engines=False,
-            torch_executed_ops={"torch.ops.aten.convolution.default"},
-            use_python_runtime=use_python_runtime,
-        )
-
-        with torchtrt.runtime.enable_cudagraphs(optimized_model) as cudagraphs_module:
-            for i in range(TRIALS):
-                trt_out_list.append(cudagraphs_module(*input_list[i]))
-                ref_out_list.append(fx_graph(*input_list[i]))
-
-        for optimized_model_results, torch_model_results in zip(
-            trt_out_list, ref_out_list
-        ):
-            torch.testing.assert_close(
-                torch_model_results,
-                optimized_model_results,
-                rtol=5e-03,
-                atol=5e-03,
-                equal_nan=True,
-                check_dtype=True,
-            )
-        torch._dynamo.reset()
-
-
-if __name__ == "__main__":
-    run_tests()
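
For reviewer context, a minimal sketch of how the renamed class surfaces through the public API after this patch, following the compile/enable_cudagraphs pattern of the example and tests removed above. The model, input shape, and option values are illustrative only and are not part of this change:

```python
import torch
import torch_tensorrt
from torch_tensorrt.dynamo.runtime._CudaGraphsTorchTensorRTModule import (
    CudaGraphsTorchTensorRTModule,
)


class SampleModel(torch.nn.Module):
    def forward(self, x):
        return torch.relu((x + 2) * 0.5)


model = SampleModel().eval().cuda()
x = torch.randn((1, 3, 224, 224)).cuda()

# torch_executed_ops forces a graph break, so the compiled module keeps a
# torch submodule alongside the TRT engine.
trt_model = torch_tensorrt.compile(
    model,
    ir="dynamo",
    inputs=[x],
    min_block_size=1,
    torch_executed_ops={"torch.ops.aten.mul.Tensor"},
)

# Because torch submodules are present, __enter__ returns the renamed
# CudaGraphsTorchTensorRTModule (formerly WrapperTorchTensorRTModule).
with torch_tensorrt.runtime.enable_cudagraphs(trt_model) as cudagraphs_module:
    assert isinstance(cudagraphs_module, CudaGraphsTorchTensorRTModule)
    out = cudagraphs_module(x)
```

Note that torch_tensorrt.save also accepts the wrapper directly and unwraps it to compiled_module, per the isinstance check updated in _compile.py.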