From 64baa52ebb2463328e4b6373cf5f8cda67d4ec45 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Mon, 9 Sep 2024 16:19:00 -0700 Subject: [PATCH 01/12] chore: updates --- examples/dynamo/utils.py | 9 ++++++++- py/torch_tensorrt/dynamo/_compiler.py | 1 + py/torch_tensorrt/dynamo/backend/backends.py | 4 ++-- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/examples/dynamo/utils.py b/examples/dynamo/utils.py index 25ad99c12d..90f1f3b72c 100644 --- a/examples/dynamo/utils.py +++ b/examples/dynamo/utils.py @@ -51,7 +51,14 @@ def generate(model, input_seq, max_tokens, eos_token_id): ) while True: - outputs = model(input_seq) + outputs = model( + input_seq, + past_key_values=None, + position_ids=None, + attention_mask=None, + use_cache=False, + token_type_ids=None, + ) logits = outputs.logits next_token_logits = logits[:, -1, :] next_tokens = torch.argmax(next_token_logits, dim=-1) diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py index 6cd3cf5f5f..faeee1ff29 100644 --- a/py/torch_tensorrt/dynamo/_compiler.py +++ b/py/torch_tensorrt/dynamo/_compiler.py @@ -288,6 +288,7 @@ def compile( trt_gm = compile_module( gm, trt_arg_inputs, trt_kwarg_inputs, settings, engine_cache ) + return trt_gm diff --git a/py/torch_tensorrt/dynamo/backend/backends.py b/py/torch_tensorrt/dynamo/backend/backends.py index 605d963a50..6570d6603a 100644 --- a/py/torch_tensorrt/dynamo/backend/backends.py +++ b/py/torch_tensorrt/dynamo/backend/backends.py @@ -87,11 +87,11 @@ def _pretraced_backend( # Remove detach nodes remove_detach(gm) - + # breakpoint() # Invoke AOTAutograd to translate operators to aten gm = aot_export_joint_simple( gm, - torch_inputs, + sample_inputs, trace_joint=False, decompositions=get_decompositions( settings.enable_experimental_decompositions From c5d3022dc8ae3b1e32db92b3d51e587bfce74aad Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Thu, 19 Sep 2024 14:25:52 -0700 Subject: [PATCH 02/12] chore: use HF generate instead of custom generate --- examples/dynamo/torch_compile_gpt2.py | 108 ++++++++++++++++++ examples/dynamo/torch_compile_llama2.py | 89 +++++++++++++++ py/torch_tensorrt/dynamo/backend/backends.py | 5 +- .../dynamo/lowering/_remove_sym_nodes.py | 14 ++- 4 files changed, 209 insertions(+), 7 deletions(-) create mode 100644 examples/dynamo/torch_compile_gpt2.py create mode 100644 examples/dynamo/torch_compile_llama2.py diff --git a/examples/dynamo/torch_compile_gpt2.py b/examples/dynamo/torch_compile_gpt2.py new file mode 100644 index 0000000000..fb08f18882 --- /dev/null +++ b/examples/dynamo/torch_compile_gpt2.py @@ -0,0 +1,108 @@ +""" +.. _torch_compile_gpt2: + +Compiling GPT2 using the Torch-TensorRT `torch.compile` Backend +========================================================== + +This interactive script is intended as a sample of the Torch-TensorRT workflow with `torch.compile` on a GPT2 model.""" + +# %% +# Imports and Model Definition +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +import torch +import torch_tensorrt +from transformers import AutoModelForCausalLM, AutoTokenizer + +# %% + +# Define the parameters +MAX_TOKENS = 32 +DEVICE = torch.device("cuda:0") + +# Define the GPT2 model from hugging face +# kv_cache is not supported in Torch-TRT currently. +# CPU is used here so that GPU memory is reserved for TRT compilation. 
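+# Because the KV cache is disabled (use_cache=False), each generation step
+# re-runs the full token sequence through the model. A rough sketch of one
+# greedy decoding step (illustration only; the real loop lives inside
+# model.generate(), and the custom generate() helper in examples/dynamo/utils.py
+# does the same thing):
+#
+#   logits = model(input_seq).logits
+#   next_token = torch.argmax(logits[:, -1, :], dim=-1)
+#   input_seq = torch.cat([input_seq, next_token[:, None]], dim=-1)
+#
+# This is slower than cached decoding, but it avoids tracing the mutable
+# past_key_values state, which Torch-TRT does not support yet.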
+with torch.no_grad(): + tokenizer = AutoTokenizer.from_pretrained("gpt2") + model = ( + AutoModelForCausalLM.from_pretrained( + "gpt2", + pad_token_id=tokenizer.eos_token_id, + use_cache=False, + attn_implementation="eager", + ) + .eval() + .cuda() + ) + +# %% +# Tokenize a sample input prompt and get pytorch model outputs +prompt = "I enjoy walking with my cute dog" +model_inputs = tokenizer(prompt, return_tensors="pt") +input_ids = model_inputs["input_ids"].cuda() +position_ids = torch.arange(input_ids.shape[1]).unsqueeze(0).cuda() +attention_mask = torch.ones_like(position_ids) + + +# Auto-regressive generation loop for greedy search using PyTorch model. +pyt_gen_tokens = model.generate( + input_ids, + max_length=MAX_TOKENS, + use_cache=False, + pad_token_id=tokenizer.eos_token_id, +) + +# %% +# Compilation with `torch.compile` using tensorrt backend and generate TensorRT outputs +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +# Compile the model and mark the input sequence length to be dynamic +torch._dynamo.mark_dynamic(input_ids, 1, min=2, max=1023) +torch._dynamo.mark_dynamic(position_ids, 1, min=2, max=1023) +torch._dynamo.mark_dynamic(attention_mask, 1, min=2, max=1023) +trt_model = torch.compile( + model, + backend="tensorrt", + dynamic=None, + options={ + "enabled_precisions": {torch.float32}, + "disable_tf32": True, + "min_block_size": 1, + }, +) +model_inputs = { + "input_ids": input_ids, + "position_ids": position_ids, + "attention_mask": attention_mask, +} +trt_model(input_ids) + +# Auto-regressive generation loop for greedy decoding using TensorRT model +trt_gen_tokens = trt_model.generate( + input_ids, + max_length=MAX_TOKENS, + use_cache=False, + pad_token_id=tokenizer.eos_token_id, +) + +# %% +# Decode the output sentences of PyTorch and TensorRT +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +print("=============================") +print( + "Pytorch model generated text: ", + tokenizer.decode(pyt_gen_tokens[0], skip_special_tokens=True), +) +print("=============================") +print( + "TensorRT model generated text: ", + tokenizer.decode(trt_gen_tokens[0], skip_special_tokens=True), +) + +# %% +# The output sentences should look like + +# Pytorch model generated text: I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with my dog. I'm not sure if I'll +# ============================= +# TensorRT model generated text: I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with my dog. I'm not sure if I'll diff --git a/examples/dynamo/torch_compile_llama2.py b/examples/dynamo/torch_compile_llama2.py new file mode 100644 index 0000000000..40ddc97d2c --- /dev/null +++ b/examples/dynamo/torch_compile_llama2.py @@ -0,0 +1,89 @@ +""" +.. _torch_compile_gpt2: + +Compiling GPT2 using the Torch-TensorRT `torch.compile` Backend +========================================================== + +This interactive script is intended as a sample of the Torch-TensorRT workflow with `torch.compile` on a GPT2 model.""" + +# %% +# Imports and Model Definition +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +import torch +import torch_tensorrt +from transformers import AutoModelForCausalLM, AutoTokenizer +from utils import generate + +# %% + +# Define the parameters +MAX_TOKENS = 32 +DEVICE = torch.device("cuda:0") + +# Define the GPT2 model from hugging face +# kv_cache is not supported in Torch-TRT currently. +# CPU is used here so that GPU memory is reserved for TRT compilation. 
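+# Note: meta-llama/Llama-2-7b-chat-hf is a gated checkpoint on the Hugging Face
+# Hub. Downloading it requires accepting Meta's license for the model and
+# authenticating beforehand, for example (assuming the huggingface_hub CLI is
+# installed):
+#
+#   huggingface-cli login
+#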
+llama_path = "meta-llama/Llama-2-7b-chat-hf" +with torch.no_grad(): + model = AutoModelForCausalLM.from_pretrained( + llama_path, use_cache=False, attn_implementation="eager" + ).eval() + +tokenizer = AutoTokenizer.from_pretrained(llama_path) + +# %% +# Tokenize a sample input prompt and get pytorch model outputs +prompt = "I enjoy walking with my cute dog" +model_inputs = tokenizer(prompt, return_tensors="pt") +input_ids = model_inputs["input_ids"].cuda() + +# Auto-regressive generation loop for greedy search using PyTorch model. +# We use a custom generate function which is very similar to the huggingface one. +# pyt_gen_tokens = generate(model, input_ids, MAX_TOKENS, tokenizer.eos_token_id) + +# %% +# Compilation with `torch.compile` using tensorrt backend and generate TensorRT outputs +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +# Compile the model and mark the input sequence length to be dynamic +with torch_tensorrt.logging.debug(): + torch._dynamo.mark_dynamic(input_ids, 1, min=7, max=1023) + model.forward = torch.compile( + model.forward, + backend="tensorrt", + dynamic=None, + options={ + "enabled_precisions": {torch.float32}, + "disable_tf32": True, + "debug": True, + # "use_python_runtime": True + }, + ) +model(input_ids) +breakpoint() +model(input_ids) +# Auto-regressive generation loop for greedy decoding using TensorRT model +# We use a custom generate function which is very similar to the huggingface one. +# Move inputs to GPU +input_ids = input_ids.to(DEVICE) +trt_gen_tokens = generate(model, input_ids, MAX_TOKENS, tokenizer.eos_token_id) + +# %% +# Decode the output sentences of PyTorch and TensorRT +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +print("=============================") +print( + "Pytorch model generated text: ", + tokenizer.decode(pyt_gen_tokens[0], skip_special_tokens=True), +) +print("=============================") +print( + "TensorRT model generated text: ", + tokenizer.decode(trt_gen_tokens[0], skip_special_tokens=True), +) + +# %% +# The output sentences should look like +# +# diff --git a/py/torch_tensorrt/dynamo/backend/backends.py b/py/torch_tensorrt/dynamo/backend/backends.py index 6570d6603a..02cc6242aa 100644 --- a/py/torch_tensorrt/dynamo/backend/backends.py +++ b/py/torch_tensorrt/dynamo/backend/backends.py @@ -80,14 +80,15 @@ def _pretraced_backend( repair_input_aliasing(gm) # Remove sym_int placeholders and inputs - remove_sym_nodes(gm) + remove_sym_nodes(gm, sample_inputs) + torch_inputs = [ input for input in sample_inputs if isinstance(input, torch.Tensor) ] # Remove detach nodes remove_detach(gm) - # breakpoint() + # Invoke AOTAutograd to translate operators to aten gm = aot_export_joint_simple( gm, diff --git a/py/torch_tensorrt/dynamo/lowering/_remove_sym_nodes.py b/py/torch_tensorrt/dynamo/lowering/_remove_sym_nodes.py index 8adebc87f8..0042012761 100644 --- a/py/torch_tensorrt/dynamo/lowering/_remove_sym_nodes.py +++ b/py/torch_tensorrt/dynamo/lowering/_remove_sym_nodes.py @@ -1,18 +1,21 @@ import logging +from typing import Any, Sequence import torch logger = logging.getLogger(__name__) -def remove_sym_nodes(gm: torch.fx.GraphModule) -> torch.fx.GraphModule: +def remove_sym_nodes( + gm: torch.fx.GraphModule, sample_inputs: Sequence[Any] +) -> torch.fx.GraphModule: """Remove sym_int placeholders which get inserted due to torch.compile's dynamic=True behavior """ # Extract SymInt placeholder Tensors - placeholder_sym_ints = [ - node - for node in gm.graph.nodes + placeholder_idx_sym_ints = [ + (idx, node) + for idx, node in enumerate(gm.graph.nodes) if ( 
node.op == "placeholder" and isinstance(node.type, type) @@ -21,8 +24,9 @@ def remove_sym_nodes(gm: torch.fx.GraphModule) -> torch.fx.GraphModule: ) ] - for node in placeholder_sym_ints: + for idx, node in placeholder_idx_sym_ints: gm.graph.erase_node(node) + sample_inputs.pop(idx) gm.graph.lint() gm.recompile() From c4f8945b0a537374c13a1dca4316f8bc60f0950d Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Mon, 23 Sep 2024 13:54:11 -0700 Subject: [PATCH 03/12] chore: updates --- examples/dynamo/torch_compile_gpt2.py | 53 ++++++++++++--------------- 1 file changed, 23 insertions(+), 30 deletions(-) diff --git a/examples/dynamo/torch_compile_gpt2.py b/examples/dynamo/torch_compile_gpt2.py index fb08f18882..9c81b4b45a 100644 --- a/examples/dynamo/torch_compile_gpt2.py +++ b/examples/dynamo/torch_compile_gpt2.py @@ -40,9 +40,6 @@ prompt = "I enjoy walking with my cute dog" model_inputs = tokenizer(prompt, return_tensors="pt") input_ids = model_inputs["input_ids"].cuda() -position_ids = torch.arange(input_ids.shape[1]).unsqueeze(0).cuda() -attention_mask = torch.ones_like(position_ids) - # Auto-regressive generation loop for greedy search using PyTorch model. pyt_gen_tokens = model.generate( @@ -56,34 +53,30 @@ # Compilation with `torch.compile` using tensorrt backend and generate TensorRT outputs # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# Compile the model and mark the input sequence length to be dynamic -torch._dynamo.mark_dynamic(input_ids, 1, min=2, max=1023) -torch._dynamo.mark_dynamic(position_ids, 1, min=2, max=1023) -torch._dynamo.mark_dynamic(attention_mask, 1, min=2, max=1023) -trt_model = torch.compile( - model, - backend="tensorrt", - dynamic=None, - options={ - "enabled_precisions": {torch.float32}, - "disable_tf32": True, - "min_block_size": 1, - }, -) -model_inputs = { - "input_ids": input_ids, - "position_ids": position_ids, - "attention_mask": attention_mask, -} -trt_model(input_ids) +with torch_tensorrt.logging.debug(): + # Compile the model and mark the input sequence length to be dynamic + torch._dynamo.mark_dynamic(input_ids, 1, min=2, max=1023) + model.forward = torch.compile( + model.forward, + backend="tensorrt", + dynamic=None, + options={ + "enabled_precisions": {torch.float32}, + "disable_tf32": True, + "min_block_size": 1, + "debug": True, + }, + ) -# Auto-regressive generation loop for greedy decoding using TensorRT model -trt_gen_tokens = trt_model.generate( - input_ids, - max_length=MAX_TOKENS, - use_cache=False, - pad_token_id=tokenizer.eos_token_id, -) + # Auto-regressive generation loop for greedy decoding using TensorRT model + # The first token generation compiles the model using TensorRT and the second token + # encounters recompilation + trt_gen_tokens = model.generate( + inputs=input_ids, + max_length=MAX_TOKENS, + use_cache=False, + pad_token_id=tokenizer.eos_token_id, + ) # %% # Decode the output sentences of PyTorch and TensorRT From d5246f934187af968298629af2d42a011bace7b4 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Tue, 24 Sep 2024 10:40:08 -0700 Subject: [PATCH 04/12] chore: updates --- docsrc/index.rst | 2 ++ examples/dynamo/README.rst | 31 +++++++++++++----- examples/dynamo/requirements.txt | 4 +-- examples/dynamo/torch_compile_gpt2.py | 45 +++++++++++++-------------- 4 files changed, 50 insertions(+), 32 deletions(-) diff --git a/docsrc/index.rst b/docsrc/index.rst index 757acc2011..0bef2f0664 100644 --- a/docsrc/index.rst +++ b/docsrc/index.rst @@ -118,6 +118,8 @@ Tutorials tutorials/_rendered_examples/distributed_inference/data_parallel_gpt2 
tutorials/_rendered_examples/distributed_inference/data_parallel_stable_diffusion tutorials/_rendered_examples/dynamo/mutable_torchtrt_module_example + tutorials/_rendered_examples/dynamo/torch_export_gpt2 + tutorials/_rendered_examples/dynamo/torch_export_llama2 Python API Documentation ------------------------ diff --git a/examples/dynamo/README.rst b/examples/dynamo/README.rst index ff3563cffe..6be2aa6515 100644 --- a/examples/dynamo/README.rst +++ b/examples/dynamo/README.rst @@ -1,15 +1,24 @@ .. _torch_compile: -Dynamo / ``torch.compile`` ----------------------------- +Torch-TensorRT Examples +==================================== -Torch-TensorRT provides a backend for the new ``torch.compile`` API released in PyTorch 2.0. In the following examples we describe -a number of ways you can leverage this backend to accelerate inference. +Please refer to the following examples which demonstrate the usage of different features of Torch-TensorRT. We also provide +examples of Torch-TensorRT compilation of select computer vision and language models. -* :ref:`torch_compile_resnet`: Compiling a ResNet model using the Torch Compile Frontend for ``torch_tensorrt.compile`` -* :ref:`torch_compile_transformer`: Compiling a Transformer model using ``torch.compile`` +Dependencies +------------------------------------ + +Please install the following external depencies (assuming you already have `torch_tensorrt` installed) + +.. code-block:: python + + pip install -r requirements.txt + + +Compiler Features +------------------------------------ * :ref:`torch_compile_advanced_usage`: Advanced usage including making a custom backend to use directly with the ``torch.compile`` API -* :ref:`torch_compile_stable_diffusion`: Compiling a Stable Diffusion model using ``torch.compile`` * :ref:`torch_export_cudagraphs`: Using the Cudagraphs integration with `ir="dynamo"` * :ref:`custom_kernel_plugins`: Creating a plugin to use a custom kernel inside TensorRT engines * :ref:`refit_engine_example`: Refitting a compiled TensorRT Graph Module with updated weights @@ -17,3 +26,11 @@ a number of ways you can leverage this backend to accelerate inference. 
* :ref:`vgg16_fp8_ptq`: Compiling a VGG16 model with FP8 and PTQ using ``torch.compile`` * :ref:`engine_caching_example`: Utilizing engine caching to speed up compilation times * :ref:`engine_caching_bert_example`: Demonstrating engine caching on BERT + +Model Zoo +------------------------------------ +* :ref:`torch_compile_resnet`: Compiling a ResNet model using the Torch Compile Frontend for ``torch_tensorrt.compile`` +* :ref:`torch_compile_transformer`: Compiling a Transformer model using ``torch.compile`` +* :ref:`torch_compile_stable_diffusion`: Compiling a Stable Diffusion model using ``torch.compile`` +* :ref:`_torch_export_gpt2`: Compiling a GPT2 model using AOT workflow (`ir=dynamo`) +* :ref:`_torch_export_llama2`: Compiling a Llama2 model using AOT workflow (`ir=dynamo`) \ No newline at end of file diff --git a/examples/dynamo/requirements.txt b/examples/dynamo/requirements.txt index 6e53935186..41fe29f09c 100644 --- a/examples/dynamo/requirements.txt +++ b/examples/dynamo/requirements.txt @@ -1,4 +1,4 @@ cupy==13.1.0 -torch>=2.4.0.dev20240503+cu121 -torch-tensorrt>=2.4.0.dev20240503+cu121 triton==2.3.0 +diffusers==0.30.3 +transformers==4.44.2 diff --git a/examples/dynamo/torch_compile_gpt2.py b/examples/dynamo/torch_compile_gpt2.py index 9c81b4b45a..6c6e1b03a2 100644 --- a/examples/dynamo/torch_compile_gpt2.py +++ b/examples/dynamo/torch_compile_gpt2.py @@ -53,30 +53,29 @@ # Compilation with `torch.compile` using tensorrt backend and generate TensorRT outputs # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -with torch_tensorrt.logging.debug(): - # Compile the model and mark the input sequence length to be dynamic - torch._dynamo.mark_dynamic(input_ids, 1, min=2, max=1023) - model.forward = torch.compile( - model.forward, - backend="tensorrt", - dynamic=None, - options={ - "enabled_precisions": {torch.float32}, - "disable_tf32": True, - "min_block_size": 1, - "debug": True, - }, - ) +# Compile the model and mark the input sequence length to be dynamic +torch._dynamo.mark_dynamic(input_ids, 1, min=2, max=1023) +model.forward = torch.compile( + model.forward, + backend="tensorrt", + dynamic=None, + options={ + "enabled_precisions": {torch.float32}, + "disable_tf32": True, + "min_block_size": 1, + "debug": True, + }, +) - # Auto-regressive generation loop for greedy decoding using TensorRT model - # The first token generation compiles the model using TensorRT and the second token - # encounters recompilation - trt_gen_tokens = model.generate( - inputs=input_ids, - max_length=MAX_TOKENS, - use_cache=False, - pad_token_id=tokenizer.eos_token_id, - ) +# Auto-regressive generation loop for greedy decoding using TensorRT model +# The first token generation compiles the model using TensorRT and the second token +# encounters recompilation +trt_gen_tokens = model.generate( + inputs=input_ids, + max_length=MAX_TOKENS, + use_cache=False, + pad_token_id=tokenizer.eos_token_id, +) # %% # Decode the output sentences of PyTorch and TensorRT From 5bfc1eca3fee20e5e096b793c469764effcda226 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Mon, 21 Oct 2024 17:27:36 -0700 Subject: [PATCH 05/12] chore: updates --- py/requirements.txt | 2 +- py/torch_tensorrt/dynamo/lowering/_decompositions.py | 3 ++- pyproject.toml | 8 ++++---- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/py/requirements.txt b/py/requirements.txt index 361afab365..d480ccbd57 100644 --- a/py/requirements.txt +++ b/py/requirements.txt @@ -3,6 +3,6 @@ packaging pybind11==2.6.2 --extra-index-url 
https://download.pytorch.org/whl/nightly/cu124 torch>=2.6.0.dev,<2.7.0 -torchvision>=0.20.0.dev,<0.21.0 +#torchvision>=0.20.0.dev,<0.21.0 --extra-index-url https://pypi.ngc.nvidia.com pyyaml diff --git a/py/torch_tensorrt/dynamo/lowering/_decompositions.py b/py/torch_tensorrt/dynamo/lowering/_decompositions.py index 534bc3eac5..d2bfeb501a 100644 --- a/py/torch_tensorrt/dynamo/lowering/_decompositions.py +++ b/py/torch_tensorrt/dynamo/lowering/_decompositions.py @@ -3,7 +3,8 @@ from typing import Any, Callable, Dict, List, Optional import torch -from torch._decomp import _decomp_table_to_post_autograd_aten, register_decomposition +from torch._decomp import register_decomposition +from torch._export.utils import _decomp_table_to_post_autograd_aten from torch._ops import OpOverload from torch_tensorrt.dynamo._defaults import default_device from torch_tensorrt.dynamo.conversion.converter_utils import get_positive_dim diff --git a/pyproject.toml b/pyproject.toml index 1284e458f4..1ec43f76fe 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,7 @@ requires = [ "cffi>=1.15.1", "typing-extensions>=4.7.0", "future>=0.18.3", - "tensorrt-cu12==10.3.0", + #"tensorrt-cu12==10.3.0", "torch>=2.6.0.dev,<2.7.0", "pybind11==2.6.2", "numpy", @@ -55,9 +55,9 @@ keywords = [ ] dependencies = [ "torch>=2.6.0.dev,<2.7.0", - "tensorrt-cu12==10.3.0", - "tensorrt-cu12-bindings==10.3.0", - "tensorrt-cu12-libs==10.3.0", + #"tensorrt-cu12==10.3.0", + #"tensorrt-cu12-bindings==10.3.0", + #"tensorrt-cu12-libs==10.3.0", "packaging>=23", "numpy", "typing-extensions>=4.7.0", From 8cf1c413aafeb49b90177cd0fa2bbc8cc9940b00 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Mon, 21 Oct 2024 18:00:09 -0700 Subject: [PATCH 06/12] chore: updates --- docker/Dockerfile.lab | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 docker/Dockerfile.lab diff --git a/docker/Dockerfile.lab b/docker/Dockerfile.lab new file mode 100644 index 0000000000..569acfecad --- /dev/null +++ b/docker/Dockerfile.lab @@ -0,0 +1,33 @@ +# syntax=docker/dockerfile:1 + +# Base image starts with CUDA +ARG BASE_IMG=nvidia/cuda:12.4.1-devel-ubuntu22.04 +FROM ${BASE_IMG} as base +ENV BASE_IMG=nvidia/cuda:12.4.1-devel-ubuntu22.04 + +ARG PYTHON_VERSION=3.10 +ENV PYTHON_VERSION=${PYTHON_VERSION} + +ARG USE_CXX11_ABI +ENV USE_CXX11=${USE_CXX11_ABI} +ENV DEBIAN_FRONTEND=noninteractive + +# Install basic dependencies +RUN apt-get update +RUN apt install -y vim build-essential manpages-dev wget zlib1g software-properties-common git libssl-dev zlib1g-dev libbz2-dev libreadline-dev libsqlite3-dev wget ca-certificates curl llvm libncurses5-dev xz-utils tk-dev libxml2-dev libxmlsec1-dev libffi-dev liblzma-dev mecab-ipadic-utf8 + +# Install PyEnv and desired Python version +ENV HOME="/root" +ENV PYENV_DIR="$HOME/.pyenv" +ENV PATH="$PYENV_DIR/shims:$PYENV_DIR/bin:$PATH" +RUN wget -L https://github.com/pyenv/pyenv-installer/raw/master/bin/pyenv-installer &&\ + chmod 755 pyenv-installer &&\ + bash pyenv-installer &&\ + eval "$(pyenv init -)" + +RUN pyenv install -v ${PYTHON_VERSION} +RUN pyenv global ${PYTHON_VERSION} + +# Setup Bazel via Bazelisk +RUN wget -q https://github.com/bazelbuild/bazelisk/releases/download/v1.17.0/bazelisk-linux-amd64 -O /usr/bin/bazel &&\ + chmod a+x /usr/bin/bazel From 289f84dfeeab492592b0dbeb0aa69c14a5c0fd2a Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Sun, 8 Dec 2024 21:49:42 -0800 Subject: [PATCH 07/12] chore: updates --- examples/dynamo/torch_export_gpt2.py | 22 ++++++++++++---------- 
examples/dynamo/utils.py | 9 +-------- 2 files changed, 13 insertions(+), 18 deletions(-) diff --git a/examples/dynamo/torch_export_gpt2.py b/examples/dynamo/torch_export_gpt2.py index cea0f3adf2..c055de5e3c 100644 --- a/examples/dynamo/torch_export_gpt2.py +++ b/examples/dynamo/torch_export_gpt2.py @@ -57,16 +57,18 @@ # 2) Enable use_explicit_typing=True. Certain layers are explicitly casted to FP32 within the pytorch model and this flag respects this behavior during TRT compilation # 3) Enable use_fp32_acc=True. This ensures all the matmuls are accumulated in FP32 precision (similar to PyTorch) gpt2_ep = export_llm(model, input_ids, max_seq_len=1024) -trt_model = torch_tensorrt.dynamo.compile( - gpt2_ep, - inputs=[input_ids], - enabled_precisions={torch.float32}, - truncate_double=True, - device=DEVICE, - disable_tf32=True, - use_explicit_typing=True, - use_fp32_acc=True, -) +with torch_tensorrt.logging.debug(): + trt_model = torch_tensorrt.dynamo.compile( + gpt2_ep, + inputs=[input_ids], + enabled_precisions={torch.float32}, + truncate_double=True, + device=DEVICE, + disable_tf32=True, + use_explicit_typing=True, + use_fp32_acc=True, + debug=True, + ) # Auto-regressive generation loop for greedy decoding using TensorRT model # We use a custom generate function which is very similar to the huggingface one. diff --git a/examples/dynamo/utils.py b/examples/dynamo/utils.py index 90f1f3b72c..25ad99c12d 100644 --- a/examples/dynamo/utils.py +++ b/examples/dynamo/utils.py @@ -51,14 +51,7 @@ def generate(model, input_seq, max_tokens, eos_token_id): ) while True: - outputs = model( - input_seq, - past_key_values=None, - position_ids=None, - attention_mask=None, - use_cache=False, - token_type_ids=None, - ) + outputs = model(input_seq) logits = outputs.logits next_token_logits = logits[:, -1, :] next_tokens = torch.argmax(next_token_logits, dim=-1) From cb9409f30a76f1398d66b7bb06ea70dfd44bd9ac Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Sun, 8 Dec 2024 22:56:29 -0800 Subject: [PATCH 08/12] chore: clean up --- examples/dynamo/torch_compile_llama2.py | 89 ------------------------- examples/dynamo/torch_export_gpt2.py | 22 +++--- py/requirements.txt | 2 +- py/torch_tensorrt/dynamo/_compiler.py | 1 - pyproject.toml | 8 +-- 5 files changed, 15 insertions(+), 107 deletions(-) delete mode 100644 examples/dynamo/torch_compile_llama2.py diff --git a/examples/dynamo/torch_compile_llama2.py b/examples/dynamo/torch_compile_llama2.py deleted file mode 100644 index 40ddc97d2c..0000000000 --- a/examples/dynamo/torch_compile_llama2.py +++ /dev/null @@ -1,89 +0,0 @@ -""" -.. _torch_compile_gpt2: - -Compiling GPT2 using the Torch-TensorRT `torch.compile` Backend -========================================================== - -This interactive script is intended as a sample of the Torch-TensorRT workflow with `torch.compile` on a GPT2 model.""" - -# %% -# Imports and Model Definition -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -import torch -import torch_tensorrt -from transformers import AutoModelForCausalLM, AutoTokenizer -from utils import generate - -# %% - -# Define the parameters -MAX_TOKENS = 32 -DEVICE = torch.device("cuda:0") - -# Define the GPT2 model from hugging face -# kv_cache is not supported in Torch-TRT currently. -# CPU is used here so that GPU memory is reserved for TRT compilation. 
-llama_path = "meta-llama/Llama-2-7b-chat-hf" -with torch.no_grad(): - model = AutoModelForCausalLM.from_pretrained( - llama_path, use_cache=False, attn_implementation="eager" - ).eval() - -tokenizer = AutoTokenizer.from_pretrained(llama_path) - -# %% -# Tokenize a sample input prompt and get pytorch model outputs -prompt = "I enjoy walking with my cute dog" -model_inputs = tokenizer(prompt, return_tensors="pt") -input_ids = model_inputs["input_ids"].cuda() - -# Auto-regressive generation loop for greedy search using PyTorch model. -# We use a custom generate function which is very similar to the huggingface one. -# pyt_gen_tokens = generate(model, input_ids, MAX_TOKENS, tokenizer.eos_token_id) - -# %% -# Compilation with `torch.compile` using tensorrt backend and generate TensorRT outputs -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -# Compile the model and mark the input sequence length to be dynamic -with torch_tensorrt.logging.debug(): - torch._dynamo.mark_dynamic(input_ids, 1, min=7, max=1023) - model.forward = torch.compile( - model.forward, - backend="tensorrt", - dynamic=None, - options={ - "enabled_precisions": {torch.float32}, - "disable_tf32": True, - "debug": True, - # "use_python_runtime": True - }, - ) -model(input_ids) -breakpoint() -model(input_ids) -# Auto-regressive generation loop for greedy decoding using TensorRT model -# We use a custom generate function which is very similar to the huggingface one. -# Move inputs to GPU -input_ids = input_ids.to(DEVICE) -trt_gen_tokens = generate(model, input_ids, MAX_TOKENS, tokenizer.eos_token_id) - -# %% -# Decode the output sentences of PyTorch and TensorRT -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -print("=============================") -print( - "Pytorch model generated text: ", - tokenizer.decode(pyt_gen_tokens[0], skip_special_tokens=True), -) -print("=============================") -print( - "TensorRT model generated text: ", - tokenizer.decode(trt_gen_tokens[0], skip_special_tokens=True), -) - -# %% -# The output sentences should look like -# -# diff --git a/examples/dynamo/torch_export_gpt2.py b/examples/dynamo/torch_export_gpt2.py index c055de5e3c..cea0f3adf2 100644 --- a/examples/dynamo/torch_export_gpt2.py +++ b/examples/dynamo/torch_export_gpt2.py @@ -57,18 +57,16 @@ # 2) Enable use_explicit_typing=True. Certain layers are explicitly casted to FP32 within the pytorch model and this flag respects this behavior during TRT compilation # 3) Enable use_fp32_acc=True. This ensures all the matmuls are accumulated in FP32 precision (similar to PyTorch) gpt2_ep = export_llm(model, input_ids, max_seq_len=1024) -with torch_tensorrt.logging.debug(): - trt_model = torch_tensorrt.dynamo.compile( - gpt2_ep, - inputs=[input_ids], - enabled_precisions={torch.float32}, - truncate_double=True, - device=DEVICE, - disable_tf32=True, - use_explicit_typing=True, - use_fp32_acc=True, - debug=True, - ) +trt_model = torch_tensorrt.dynamo.compile( + gpt2_ep, + inputs=[input_ids], + enabled_precisions={torch.float32}, + truncate_double=True, + device=DEVICE, + disable_tf32=True, + use_explicit_typing=True, + use_fp32_acc=True, +) # Auto-regressive generation loop for greedy decoding using TensorRT model # We use a custom generate function which is very similar to the huggingface one. 
diff --git a/py/requirements.txt b/py/requirements.txt index d480ccbd57..361afab365 100644 --- a/py/requirements.txt +++ b/py/requirements.txt @@ -3,6 +3,6 @@ packaging pybind11==2.6.2 --extra-index-url https://download.pytorch.org/whl/nightly/cu124 torch>=2.6.0.dev,<2.7.0 -#torchvision>=0.20.0.dev,<0.21.0 +torchvision>=0.20.0.dev,<0.21.0 --extra-index-url https://pypi.ngc.nvidia.com pyyaml diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py index d7792e7464..9859668cd9 100644 --- a/py/torch_tensorrt/dynamo/_compiler.py +++ b/py/torch_tensorrt/dynamo/_compiler.py @@ -608,7 +608,6 @@ def compile( trt_gm = compile_module( gm, trt_arg_inputs, trt_kwarg_inputs, settings, engine_cache ) - return trt_gm diff --git a/pyproject.toml b/pyproject.toml index 1ec43f76fe..1284e458f4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,7 @@ requires = [ "cffi>=1.15.1", "typing-extensions>=4.7.0", "future>=0.18.3", - #"tensorrt-cu12==10.3.0", + "tensorrt-cu12==10.3.0", "torch>=2.6.0.dev,<2.7.0", "pybind11==2.6.2", "numpy", @@ -55,9 +55,9 @@ keywords = [ ] dependencies = [ "torch>=2.6.0.dev,<2.7.0", - #"tensorrt-cu12==10.3.0", - #"tensorrt-cu12-bindings==10.3.0", - #"tensorrt-cu12-libs==10.3.0", + "tensorrt-cu12==10.3.0", + "tensorrt-cu12-bindings==10.3.0", + "tensorrt-cu12-libs==10.3.0", "packaging>=23", "numpy", "typing-extensions>=4.7.0", From 8725a483015f9fa64383deaabfd645ae9436ab37 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Sun, 8 Dec 2024 22:58:16 -0800 Subject: [PATCH 09/12] chore: remove redundant file --- docker/Dockerfile.lab | 33 --------------------------------- 1 file changed, 33 deletions(-) delete mode 100644 docker/Dockerfile.lab diff --git a/docker/Dockerfile.lab b/docker/Dockerfile.lab deleted file mode 100644 index 569acfecad..0000000000 --- a/docker/Dockerfile.lab +++ /dev/null @@ -1,33 +0,0 @@ -# syntax=docker/dockerfile:1 - -# Base image starts with CUDA -ARG BASE_IMG=nvidia/cuda:12.4.1-devel-ubuntu22.04 -FROM ${BASE_IMG} as base -ENV BASE_IMG=nvidia/cuda:12.4.1-devel-ubuntu22.04 - -ARG PYTHON_VERSION=3.10 -ENV PYTHON_VERSION=${PYTHON_VERSION} - -ARG USE_CXX11_ABI -ENV USE_CXX11=${USE_CXX11_ABI} -ENV DEBIAN_FRONTEND=noninteractive - -# Install basic dependencies -RUN apt-get update -RUN apt install -y vim build-essential manpages-dev wget zlib1g software-properties-common git libssl-dev zlib1g-dev libbz2-dev libreadline-dev libsqlite3-dev wget ca-certificates curl llvm libncurses5-dev xz-utils tk-dev libxml2-dev libxmlsec1-dev libffi-dev liblzma-dev mecab-ipadic-utf8 - -# Install PyEnv and desired Python version -ENV HOME="/root" -ENV PYENV_DIR="$HOME/.pyenv" -ENV PATH="$PYENV_DIR/shims:$PYENV_DIR/bin:$PATH" -RUN wget -L https://github.com/pyenv/pyenv-installer/raw/master/bin/pyenv-installer &&\ - chmod 755 pyenv-installer &&\ - bash pyenv-installer &&\ - eval "$(pyenv init -)" - -RUN pyenv install -v ${PYTHON_VERSION} -RUN pyenv global ${PYTHON_VERSION} - -# Setup Bazel via Bazelisk -RUN wget -q https://github.com/bazelbuild/bazelisk/releases/download/v1.17.0/bazelisk-linux-amd64 -O /usr/bin/bazel &&\ - chmod a+x /usr/bin/bazel From 332d32042788dfa16f66620cd66778a2d28524c4 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Sun, 8 Dec 2024 22:59:26 -0800 Subject: [PATCH 10/12] chore: rebase --- examples/dynamo/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/dynamo/requirements.txt b/examples/dynamo/requirements.txt index 41fe29f09c..59a802918c 100644 --- 
a/examples/dynamo/requirements.txt +++ b/examples/dynamo/requirements.txt @@ -1,4 +1,4 @@ cupy==13.1.0 triton==2.3.0 diffusers==0.30.3 -transformers==4.44.2 +transformers==4.44.2 \ No newline at end of file From 4ed54100a0700990b4d3ed742b236dd28536992e Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Mon, 16 Dec 2024 10:52:10 -0800 Subject: [PATCH 11/12] chore: update docs --- docsrc/index.rst | 2 ++ examples/dynamo/README.rst | 1 + 2 files changed, 3 insertions(+) diff --git a/docsrc/index.rst b/docsrc/index.rst index 5d88c8ecae..c762080649 100644 --- a/docsrc/index.rst +++ b/docsrc/index.rst @@ -134,6 +134,7 @@ Model Zoo * :ref:`torch_compile_resnet` * :ref:`torch_compile_transformer` * :ref:`torch_compile_stable_diffusion` +* :ref:`torch_compile_gpt2` * :ref:`torch_export_gpt2` * :ref:`torch_export_llama2` * :ref:`notebooks` @@ -148,6 +149,7 @@ Model Zoo tutorials/_rendered_examples/dynamo/torch_compile_stable_diffusion tutorials/_rendered_examples/distributed_inference/data_parallel_gpt2 tutorials/_rendered_examples/distributed_inference/data_parallel_stable_diffusion + tutorials/_rendered_examples/dynamo/torch_compile_gpt2 tutorials/_rendered_examples/dynamo/torch_export_gpt2 tutorials/_rendered_examples/dynamo/torch_export_llama2 tutorials/notebooks diff --git a/examples/dynamo/README.rst b/examples/dynamo/README.rst index 60f1969be2..5d3b9d4261 100644 --- a/examples/dynamo/README.rst +++ b/examples/dynamo/README.rst @@ -17,5 +17,6 @@ Model Zoo * :ref:`torch_compile_resnet`: Compiling a ResNet model using the Torch Compile Frontend for ``torch_tensorrt.compile`` * :ref:`torch_compile_transformer`: Compiling a Transformer model using ``torch.compile`` * :ref:`torch_compile_stable_diffusion`: Compiling a Stable Diffusion model using ``torch.compile`` +* :ref:`_torch_compile_gpt2`: Compiling a GPT2 model using ``torch.compile`` * :ref:`_torch_export_gpt2`: Compiling a GPT2 model using AOT workflow (`ir=dynamo`) * :ref:`_torch_export_llama2`: Compiling a Llama2 model using AOT workflow (`ir=dynamo`) \ No newline at end of file From 9b8773c760250b0f168adbb7346831805a926c7d Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Mon, 16 Dec 2024 12:05:17 -0800 Subject: [PATCH 12/12] chore: update docs --- examples/dynamo/torch_compile_gpt2.py | 67 +++++++++++++++++---------- 1 file changed, 42 insertions(+), 25 deletions(-) diff --git a/examples/dynamo/torch_compile_gpt2.py b/examples/dynamo/torch_compile_gpt2.py index 6c6e1b03a2..5d41c3ed84 100644 --- a/examples/dynamo/torch_compile_gpt2.py +++ b/examples/dynamo/torch_compile_gpt2.py @@ -1,27 +1,40 @@ """ .. _torch_compile_gpt2: -Compiling GPT2 using the Torch-TensorRT `torch.compile` Backend +Compiling GPT2 using the Torch-TensorRT ``torch.compile`` frontend ========================================================== -This interactive script is intended as a sample of the Torch-TensorRT workflow with `torch.compile` on a GPT2 model.""" +This example illustrates the state of the art model `GPT2 `_ optimized using +``torch.compile`` frontend of Torch-TensorRT. Install the following dependencies before compilation + +.. code-block:: python + + pip install -r requirements.txt + +GPT2 is a causal (unidirectional) transformer pretrained using language modeling on a very large corpus of text data. In this example, we use the GPT2 model available at `HuggingFace `_ and apply torch.compile on it to +get the graph module representation of the graph. Torch-TensorRT converts this graph into an optimized TensorRT engine. 
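+
+In outline, the workflow reduces to the following simplified sketch (the names used
+here are the ones defined in the body of this script, which is the complete,
+runnable version):
+
+.. code-block:: python
+
+    model.forward = torch.compile(model.forward, backend="tensorrt",
+                                  options={"enabled_precisions": {torch.float32}})
+    trt_gen_tokens = model.generate(input_ids, max_length=MAX_LENGTH, use_cache=False)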
+""" # %% -# Imports and Model Definition -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# Import necessary libraries +# ----------------------------- import torch import torch_tensorrt from transformers import AutoModelForCausalLM, AutoTokenizer # %% - -# Define the parameters -MAX_TOKENS = 32 +# Define the necessary parameters +# ----------------------------- +# Torch-TensorRT requires a GPU for successful compilation of the model. +# ``MAX_LENGTH`` is the maximum length the generated tokens can have. This corresponds to the length of the input prompt + +# number of new tokens generated +MAX_LENGTH = 32 DEVICE = torch.device("cuda:0") -# Define the GPT2 model from hugging face -# kv_cache is not supported in Torch-TRT currently. -# CPU is used here so that GPU memory is reserved for TRT compilation. +# %% +# Model definition +# ----------------------------- +# We use ``AutoModelForCausalLM`` class to load the pretrained GPT2 model from hugging face. ``kv_cache`` is not supported in Torch-TRT currently so ``use_cache=False`` with torch.no_grad(): tokenizer = AutoTokenizer.from_pretrained("gpt2") model = ( @@ -36,24 +49,28 @@ ) # %% +# PyTorch inference +# ----------------------------- # Tokenize a sample input prompt and get pytorch model outputs prompt = "I enjoy walking with my cute dog" model_inputs = tokenizer(prompt, return_tensors="pt") input_ids = model_inputs["input_ids"].cuda() -# Auto-regressive generation loop for greedy search using PyTorch model. +# %% +# The ``generate()`` API of the ``AutoModelForCausalLM`` class is used for auto-regressive generation with greedy decoding. pyt_gen_tokens = model.generate( input_ids, - max_length=MAX_TOKENS, + max_length=MAX_LENGTH, use_cache=False, pad_token_id=tokenizer.eos_token_id, ) # %% -# Compilation with `torch.compile` using tensorrt backend and generate TensorRT outputs -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -# Compile the model and mark the input sequence length to be dynamic +# Torch-TensorRT compilation and inference +# ----------------------------- +# The input sequence length is dynamic, so we mark it using ``torch._dynamo.mark_dynamic`` API. +# We provide a (min, max) range of this value so that TensorRT knows in advance what values to optimize for. +# Usually, this would be the context length for the model. We start with ``min=2`` due to the `0/1 specialization `_ torch._dynamo.mark_dynamic(input_ids, 1, min=2, max=1023) model.forward = torch.compile( model.forward, @@ -63,25 +80,23 @@ "enabled_precisions": {torch.float32}, "disable_tf32": True, "min_block_size": 1, - "debug": True, }, ) +# %% # Auto-regressive generation loop for greedy decoding using TensorRT model # The first token generation compiles the model using TensorRT and the second token -# encounters recompilation +# encounters recompilation (which is an issue currently that would be resolved in the future) trt_gen_tokens = model.generate( inputs=input_ids, - max_length=MAX_TOKENS, + max_length=MAX_LENGTH, use_cache=False, pad_token_id=tokenizer.eos_token_id, ) # %% # Decode the output sentences of PyTorch and TensorRT -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -print("=============================") +# ----------------------------- print( "Pytorch model generated text: ", tokenizer.decode(pyt_gen_tokens[0], skip_special_tokens=True), @@ -95,6 +110,8 @@ # %% # The output sentences should look like -# Pytorch model generated text: I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with my dog. 
I'm not sure if I'll -# ============================= -# TensorRT model generated text: I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with my dog. I'm not sure if I'll +""" +Pytorch model generated text: I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with my dog. I'm not sure if I'll +============================= +TensorRT model generated text: I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with my dog. I'm not sure if I'll +"""
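+
+# %%
+# Optional, illustrative sanity check (not part of the original example): time a
+# single forward pass of the compiled model. The TensorRT engine is already built
+# at this point, so the call below reflects steady-state latency;
+# torch.cuda.synchronize() makes sure the GPU work is included in the measurement.
+import time
+
+torch.cuda.synchronize()
+start = time.perf_counter()
+model(input_ids)
+torch.cuda.synchronize()
+print(f"Single TensorRT-accelerated forward pass: {time.perf_counter() - start:.4f} s")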