From 64baa52ebb2463328e4b6373cf5f8cda67d4ec45 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Mon, 9 Sep 2024 16:19:00 -0700 Subject: [PATCH 01/12] chore: updates --- examples/dynamo/utils.py | 9 ++++++++- py/torch_tensorrt/dynamo/_compiler.py | 1 + py/torch_tensorrt/dynamo/backend/backends.py | 4 ++-- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/examples/dynamo/utils.py b/examples/dynamo/utils.py index 25ad99c12d..90f1f3b72c 100644 --- a/examples/dynamo/utils.py +++ b/examples/dynamo/utils.py @@ -51,7 +51,14 @@ def generate(model, input_seq, max_tokens, eos_token_id): ) while True: - outputs = model(input_seq) + outputs = model( + input_seq, + past_key_values=None, + position_ids=None, + attention_mask=None, + use_cache=False, + token_type_ids=None, + ) logits = outputs.logits next_token_logits = logits[:, -1, :] next_tokens = torch.argmax(next_token_logits, dim=-1) diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py index 6cd3cf5f5f..faeee1ff29 100644 --- a/py/torch_tensorrt/dynamo/_compiler.py +++ b/py/torch_tensorrt/dynamo/_compiler.py @@ -288,6 +288,7 @@ def compile( trt_gm = compile_module( gm, trt_arg_inputs, trt_kwarg_inputs, settings, engine_cache ) + return trt_gm diff --git a/py/torch_tensorrt/dynamo/backend/backends.py b/py/torch_tensorrt/dynamo/backend/backends.py index 605d963a50..6570d6603a 100644 --- a/py/torch_tensorrt/dynamo/backend/backends.py +++ b/py/torch_tensorrt/dynamo/backend/backends.py @@ -87,11 +87,11 @@ def _pretraced_backend( # Remove detach nodes remove_detach(gm) - + # breakpoint() # Invoke AOTAutograd to translate operators to aten gm = aot_export_joint_simple( gm, - torch_inputs, + sample_inputs, trace_joint=False, decompositions=get_decompositions( settings.enable_experimental_decompositions From c5d3022dc8ae3b1e32db92b3d51e587bfce74aad Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Thu, 19 Sep 2024 14:25:52 -0700 Subject: [PATCH 02/12] chore: use HF generate instead of custom generate --- examples/dynamo/torch_compile_gpt2.py | 108 ++++++++++++++++++ examples/dynamo/torch_compile_llama2.py | 89 +++++++++++++++ py/torch_tensorrt/dynamo/backend/backends.py | 5 +- .../dynamo/lowering/_remove_sym_nodes.py | 14 ++- 4 files changed, 209 insertions(+), 7 deletions(-) create mode 100644 examples/dynamo/torch_compile_gpt2.py create mode 100644 examples/dynamo/torch_compile_llama2.py diff --git a/examples/dynamo/torch_compile_gpt2.py b/examples/dynamo/torch_compile_gpt2.py new file mode 100644 index 0000000000..fb08f18882 --- /dev/null +++ b/examples/dynamo/torch_compile_gpt2.py @@ -0,0 +1,108 @@ +""" +.. _torch_compile_gpt2: + +Compiling GPT2 using the Torch-TensorRT `torch.compile` Backend +========================================================== + +This interactive script is intended as a sample of the Torch-TensorRT workflow with `torch.compile` on a GPT2 model.""" + +# %% +# Imports and Model Definition +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +import torch +import torch_tensorrt +from transformers import AutoModelForCausalLM, AutoTokenizer + +# %% + +# Define the parameters +MAX_TOKENS = 32 +DEVICE = torch.device("cuda:0") + +# Define the GPT2 model from hugging face +# kv_cache is not supported in Torch-TRT currently. +# CPU is used here so that GPU memory is reserved for TRT compilation. 
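+# Because the KV cache is disabled (use_cache=False), each generation step
+# re-runs the full token sequence through the model. A rough sketch of one
+# greedy decoding step (illustration only; the real loop lives inside
+# model.generate(), and the custom generate() helper in examples/dynamo/utils.py
+# does the same thing):
+#
+#   logits = model(input_seq).logits
+#   next_token = torch.argmax(logits[:, -1, :], dim=-1)
+#   input_seq = torch.cat([input_seq, next_token[:, None]], dim=-1)
+#
+# This is slower than cached decoding, but it avoids tracing the mutable
+# past_key_values state, which Torch-TRT does not support yet.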
+with torch.no_grad(): + tokenizer = AutoTokenizer.from_pretrained("gpt2") + model = ( + AutoModelForCausalLM.from_pretrained( + "gpt2", + pad_token_id=tokenizer.eos_token_id, + use_cache=False, + attn_implementation="eager", + ) + .eval() + .cuda() + ) + +# %% +# Tokenize a sample input prompt and get pytorch model outputs +prompt = "I enjoy walking with my cute dog" +model_inputs = tokenizer(prompt, return_tensors="pt") +input_ids = model_inputs["input_ids"].cuda() +position_ids = torch.arange(input_ids.shape[1]).unsqueeze(0).cuda() +attention_mask = torch.ones_like(position_ids) + + +# Auto-regressive generation loop for greedy search using PyTorch model. +pyt_gen_tokens = model.generate( + input_ids, + max_length=MAX_TOKENS, + use_cache=False, + pad_token_id=tokenizer.eos_token_id, +) + +# %% +# Compilation with `torch.compile` using tensorrt backend and generate TensorRT outputs +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +# Compile the model and mark the input sequence length to be dynamic +torch._dynamo.mark_dynamic(input_ids, 1, min=2, max=1023) +torch._dynamo.mark_dynamic(position_ids, 1, min=2, max=1023) +torch._dynamo.mark_dynamic(attention_mask, 1, min=2, max=1023) +trt_model = torch.compile( + model, + backend="tensorrt", + dynamic=None, + options={ + "enabled_precisions": {torch.float32}, + "disable_tf32": True, + "min_block_size": 1, + }, +) +model_inputs = { + "input_ids": input_ids, + "position_ids": position_ids, + "attention_mask": attention_mask, +} +trt_model(input_ids) + +# Auto-regressive generation loop for greedy decoding using TensorRT model +trt_gen_tokens = trt_model.generate( + input_ids, + max_length=MAX_TOKENS, + use_cache=False, + pad_token_id=tokenizer.eos_token_id, +) + +# %% +# Decode the output sentences of PyTorch and TensorRT +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +print("=============================") +print( + "Pytorch model generated text: ", + tokenizer.decode(pyt_gen_tokens[0], skip_special_tokens=True), +) +print("=============================") +print( + "TensorRT model generated text: ", + tokenizer.decode(trt_gen_tokens[0], skip_special_tokens=True), +) + +# %% +# The output sentences should look like + +# Pytorch model generated text: I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with my dog. I'm not sure if I'll +# ============================= +# TensorRT model generated text: I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with my dog. I'm not sure if I'll diff --git a/examples/dynamo/torch_compile_llama2.py b/examples/dynamo/torch_compile_llama2.py new file mode 100644 index 0000000000..40ddc97d2c --- /dev/null +++ b/examples/dynamo/torch_compile_llama2.py @@ -0,0 +1,89 @@ +""" +.. _torch_compile_gpt2: + +Compiling GPT2 using the Torch-TensorRT `torch.compile` Backend +========================================================== + +This interactive script is intended as a sample of the Torch-TensorRT workflow with `torch.compile` on a GPT2 model.""" + +# %% +# Imports and Model Definition +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +import torch +import torch_tensorrt +from transformers import AutoModelForCausalLM, AutoTokenizer +from utils import generate + +# %% + +# Define the parameters +MAX_TOKENS = 32 +DEVICE = torch.device("cuda:0") + +# Define the GPT2 model from hugging face +# kv_cache is not supported in Torch-TRT currently. +# CPU is used here so that GPU memory is reserved for TRT compilation. 
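+# Note: meta-llama/Llama-2-7b-chat-hf is a gated checkpoint on the Hugging Face
+# Hub. Downloading it requires accepting Meta's license for the model and
+# authenticating beforehand, for example (assuming the huggingface_hub CLI is
+# installed):
+#
+#   huggingface-cli login
+#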
+llama_path = "meta-llama/Llama-2-7b-chat-hf" +with torch.no_grad(): + model = AutoModelForCausalLM.from_pretrained( + llama_path, use_cache=False, attn_implementation="eager" + ).eval() + +tokenizer = AutoTokenizer.from_pretrained(llama_path) + +# %% +# Tokenize a sample input prompt and get pytorch model outputs +prompt = "I enjoy walking with my cute dog" +model_inputs = tokenizer(prompt, return_tensors="pt") +input_ids = model_inputs["input_ids"].cuda() + +# Auto-regressive generation loop for greedy search using PyTorch model. +# We use a custom generate function which is very similar to the huggingface one. +# pyt_gen_tokens = generate(model, input_ids, MAX_TOKENS, tokenizer.eos_token_id) + +# %% +# Compilation with `torch.compile` using tensorrt backend and generate TensorRT outputs +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +# Compile the model and mark the input sequence length to be dynamic +with torch_tensorrt.logging.debug(): + torch._dynamo.mark_dynamic(input_ids, 1, min=7, max=1023) + model.forward = torch.compile( + model.forward, + backend="tensorrt", + dynamic=None, + options={ + "enabled_precisions": {torch.float32}, + "disable_tf32": True, + "debug": True, + # "use_python_runtime": True + }, + ) +model(input_ids) +breakpoint() +model(input_ids) +# Auto-regressive generation loop for greedy decoding using TensorRT model +# We use a custom generate function which is very similar to the huggingface one. +# Move inputs to GPU +input_ids = input_ids.to(DEVICE) +trt_gen_tokens = generate(model, input_ids, MAX_TOKENS, tokenizer.eos_token_id) + +# %% +# Decode the output sentences of PyTorch and TensorRT +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +print("=============================") +print( + "Pytorch model generated text: ", + tokenizer.decode(pyt_gen_tokens[0], skip_special_tokens=True), +) +print("=============================") +print( + "TensorRT model generated text: ", + tokenizer.decode(trt_gen_tokens[0], skip_special_tokens=True), +) + +# %% +# The output sentences should look like +# +# diff --git a/py/torch_tensorrt/dynamo/backend/backends.py b/py/torch_tensorrt/dynamo/backend/backends.py index 6570d6603a..02cc6242aa 100644 --- a/py/torch_tensorrt/dynamo/backend/backends.py +++ b/py/torch_tensorrt/dynamo/backend/backends.py @@ -80,14 +80,15 @@ def _pretraced_backend( repair_input_aliasing(gm) # Remove sym_int placeholders and inputs - remove_sym_nodes(gm) + remove_sym_nodes(gm, sample_inputs) + torch_inputs = [ input for input in sample_inputs if isinstance(input, torch.Tensor) ] # Remove detach nodes remove_detach(gm) - # breakpoint() + # Invoke AOTAutograd to translate operators to aten gm = aot_export_joint_simple( gm, diff --git a/py/torch_tensorrt/dynamo/lowering/_remove_sym_nodes.py b/py/torch_tensorrt/dynamo/lowering/_remove_sym_nodes.py index 8adebc87f8..0042012761 100644 --- a/py/torch_tensorrt/dynamo/lowering/_remove_sym_nodes.py +++ b/py/torch_tensorrt/dynamo/lowering/_remove_sym_nodes.py @@ -1,18 +1,21 @@ import logging +from typing import Any, Sequence import torch logger = logging.getLogger(__name__) -def remove_sym_nodes(gm: torch.fx.GraphModule) -> torch.fx.GraphModule: +def remove_sym_nodes( + gm: torch.fx.GraphModule, sample_inputs: Sequence[Any] +) -> torch.fx.GraphModule: """Remove sym_int placeholders which get inserted due to torch.compile's dynamic=True behavior """ # Extract SymInt placeholder Tensors - placeholder_sym_ints = [ - node - for node in gm.graph.nodes + placeholder_idx_sym_ints = [ + (idx, node) + for idx, node in enumerate(gm.graph.nodes) if ( 
node.op == "placeholder" and isinstance(node.type, type) @@ -21,8 +24,9 @@ def remove_sym_nodes(gm: torch.fx.GraphModule) -> torch.fx.GraphModule: ) ] - for node in placeholder_sym_ints: + for idx, node in placeholder_idx_sym_ints: gm.graph.erase_node(node) + sample_inputs.pop(idx) gm.graph.lint() gm.recompile() From c4f8945b0a537374c13a1dca4316f8bc60f0950d Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Mon, 23 Sep 2024 13:54:11 -0700 Subject: [PATCH 03/12] chore: updates --- examples/dynamo/torch_compile_gpt2.py | 53 ++++++++++++--------------- 1 file changed, 23 insertions(+), 30 deletions(-) diff --git a/examples/dynamo/torch_compile_gpt2.py b/examples/dynamo/torch_compile_gpt2.py index fb08f18882..9c81b4b45a 100644 --- a/examples/dynamo/torch_compile_gpt2.py +++ b/examples/dynamo/torch_compile_gpt2.py @@ -40,9 +40,6 @@ prompt = "I enjoy walking with my cute dog" model_inputs = tokenizer(prompt, return_tensors="pt") input_ids = model_inputs["input_ids"].cuda() -position_ids = torch.arange(input_ids.shape[1]).unsqueeze(0).cuda() -attention_mask = torch.ones_like(position_ids) - # Auto-regressive generation loop for greedy search using PyTorch model. pyt_gen_tokens = model.generate( @@ -56,34 +53,30 @@ # Compilation with `torch.compile` using tensorrt backend and generate TensorRT outputs # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -# Compile the model and mark the input sequence length to be dynamic -torch._dynamo.mark_dynamic(input_ids, 1, min=2, max=1023) -torch._dynamo.mark_dynamic(position_ids, 1, min=2, max=1023) -torch._dynamo.mark_dynamic(attention_mask, 1, min=2, max=1023) -trt_model = torch.compile( - model, - backend="tensorrt", - dynamic=None, - options={ - "enabled_precisions": {torch.float32}, - "disable_tf32": True, - "min_block_size": 1, - }, -) -model_inputs = { - "input_ids": input_ids, - "position_ids": position_ids, - "attention_mask": attention_mask, -} -trt_model(input_ids) +with torch_tensorrt.logging.debug(): + # Compile the model and mark the input sequence length to be dynamic + torch._dynamo.mark_dynamic(input_ids, 1, min=2, max=1023) + model.forward = torch.compile( + model.forward, + backend="tensorrt", + dynamic=None, + options={ + "enabled_precisions": {torch.float32}, + "disable_tf32": True, + "min_block_size": 1, + "debug": True, + }, + ) -# Auto-regressive generation loop for greedy decoding using TensorRT model -trt_gen_tokens = trt_model.generate( - input_ids, - max_length=MAX_TOKENS, - use_cache=False, - pad_token_id=tokenizer.eos_token_id, -) + # Auto-regressive generation loop for greedy decoding using TensorRT model + # The first token generation compiles the model using TensorRT and the second token + # encounters recompilation + trt_gen_tokens = model.generate( + inputs=input_ids, + max_length=MAX_TOKENS, + use_cache=False, + pad_token_id=tokenizer.eos_token_id, + ) # %% # Decode the output sentences of PyTorch and TensorRT From d5246f934187af968298629af2d42a011bace7b4 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Tue, 24 Sep 2024 10:40:08 -0700 Subject: [PATCH 04/12] chore: updates --- docsrc/index.rst | 2 ++ examples/dynamo/README.rst | 31 +++++++++++++----- examples/dynamo/requirements.txt | 4 +-- examples/dynamo/torch_compile_gpt2.py | 45 +++++++++++++-------------- 4 files changed, 50 insertions(+), 32 deletions(-) diff --git a/docsrc/index.rst b/docsrc/index.rst index 757acc2011..0bef2f0664 100644 --- a/docsrc/index.rst +++ b/docsrc/index.rst @@ -118,6 +118,8 @@ Tutorials tutorials/_rendered_examples/distributed_inference/data_parallel_gpt2 
tutorials/_rendered_examples/distributed_inference/data_parallel_stable_diffusion tutorials/_rendered_examples/dynamo/mutable_torchtrt_module_example + tutorials/_rendered_examples/dynamo/torch_export_gpt2 + tutorials/_rendered_examples/dynamo/torch_export_llama2 Python API Documentation ------------------------ diff --git a/examples/dynamo/README.rst b/examples/dynamo/README.rst index ff3563cffe..6be2aa6515 100644 --- a/examples/dynamo/README.rst +++ b/examples/dynamo/README.rst @@ -1,15 +1,24 @@ .. _torch_compile: -Dynamo / ``torch.compile`` ----------------------------- +Torch-TensorRT Examples +==================================== -Torch-TensorRT provides a backend for the new ``torch.compile`` API released in PyTorch 2.0. In the following examples we describe -a number of ways you can leverage this backend to accelerate inference. +Please refer to the following examples which demonstrate the usage of different features of Torch-TensorRT. We also provide +examples of Torch-TensorRT compilation of select computer vision and language models. -* :ref:`torch_compile_resnet`: Compiling a ResNet model using the Torch Compile Frontend for ``torch_tensorrt.compile`` -* :ref:`torch_compile_transformer`: Compiling a Transformer model using ``torch.compile`` +Dependencies +------------------------------------ + +Please install the following external depencies (assuming you already have `torch_tensorrt` installed) + +.. code-block:: python + + pip install -r requirements.txt + + +Compiler Features +------------------------------------ * :ref:`torch_compile_advanced_usage`: Advanced usage including making a custom backend to use directly with the ``torch.compile`` API -* :ref:`torch_compile_stable_diffusion`: Compiling a Stable Diffusion model using ``torch.compile`` * :ref:`torch_export_cudagraphs`: Using the Cudagraphs integration with `ir="dynamo"` * :ref:`custom_kernel_plugins`: Creating a plugin to use a custom kernel inside TensorRT engines * :ref:`refit_engine_example`: Refitting a compiled TensorRT Graph Module with updated weights @@ -17,3 +26,11 @@ a number of ways you can leverage this backend to accelerate inference. 
* :ref:`vgg16_fp8_ptq`: Compiling a VGG16 model with FP8 and PTQ using ``torch.compile`` * :ref:`engine_caching_example`: Utilizing engine caching to speed up compilation times * :ref:`engine_caching_bert_example`: Demonstrating engine caching on BERT + +Model Zoo +------------------------------------ +* :ref:`torch_compile_resnet`: Compiling a ResNet model using the Torch Compile Frontend for ``torch_tensorrt.compile`` +* :ref:`torch_compile_transformer`: Compiling a Transformer model using ``torch.compile`` +* :ref:`torch_compile_stable_diffusion`: Compiling a Stable Diffusion model using ``torch.compile`` +* :ref:`_torch_export_gpt2`: Compiling a GPT2 model using AOT workflow (`ir=dynamo`) +* :ref:`_torch_export_llama2`: Compiling a Llama2 model using AOT workflow (`ir=dynamo`) \ No newline at end of file diff --git a/examples/dynamo/requirements.txt b/examples/dynamo/requirements.txt index 6e53935186..41fe29f09c 100644 --- a/examples/dynamo/requirements.txt +++ b/examples/dynamo/requirements.txt @@ -1,4 +1,4 @@ cupy==13.1.0 -torch>=2.4.0.dev20240503+cu121 -torch-tensorrt>=2.4.0.dev20240503+cu121 triton==2.3.0 +diffusers==0.30.3 +transformers==4.44.2 diff --git a/examples/dynamo/torch_compile_gpt2.py b/examples/dynamo/torch_compile_gpt2.py index 9c81b4b45a..6c6e1b03a2 100644 --- a/examples/dynamo/torch_compile_gpt2.py +++ b/examples/dynamo/torch_compile_gpt2.py @@ -53,30 +53,29 @@ # Compilation with `torch.compile` using tensorrt backend and generate TensorRT outputs # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -with torch_tensorrt.logging.debug(): - # Compile the model and mark the input sequence length to be dynamic - torch._dynamo.mark_dynamic(input_ids, 1, min=2, max=1023) - model.forward = torch.compile( - model.forward, - backend="tensorrt", - dynamic=None, - options={ - "enabled_precisions": {torch.float32}, - "disable_tf32": True, - "min_block_size": 1, - "debug": True, - }, - ) +# Compile the model and mark the input sequence length to be dynamic +torch._dynamo.mark_dynamic(input_ids, 1, min=2, max=1023) +model.forward = torch.compile( + model.forward, + backend="tensorrt", + dynamic=None, + options={ + "enabled_precisions": {torch.float32}, + "disable_tf32": True, + "min_block_size": 1, + "debug": True, + }, +) - # Auto-regressive generation loop for greedy decoding using TensorRT model - # The first token generation compiles the model using TensorRT and the second token - # encounters recompilation - trt_gen_tokens = model.generate( - inputs=input_ids, - max_length=MAX_TOKENS, - use_cache=False, - pad_token_id=tokenizer.eos_token_id, - ) +# Auto-regressive generation loop for greedy decoding using TensorRT model +# The first token generation compiles the model using TensorRT and the second token +# encounters recompilation +trt_gen_tokens = model.generate( + inputs=input_ids, + max_length=MAX_TOKENS, + use_cache=False, + pad_token_id=tokenizer.eos_token_id, +) # %% # Decode the output sentences of PyTorch and TensorRT From 5bfc1eca3fee20e5e096b793c469764effcda226 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Mon, 21 Oct 2024 17:27:36 -0700 Subject: [PATCH 05/12] chore: updates --- py/requirements.txt | 2 +- py/torch_tensorrt/dynamo/lowering/_decompositions.py | 3 ++- pyproject.toml | 8 ++++---- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/py/requirements.txt b/py/requirements.txt index 361afab365..d480ccbd57 100644 --- a/py/requirements.txt +++ b/py/requirements.txt @@ -3,6 +3,6 @@ packaging pybind11==2.6.2 --extra-index-url 
https://download.pytorch.org/whl/nightly/cu124 torch>=2.6.0.dev,<2.7.0 -torchvision>=0.20.0.dev,<0.21.0 +#torchvision>=0.20.0.dev,<0.21.0 --extra-index-url https://pypi.ngc.nvidia.com pyyaml diff --git a/py/torch_tensorrt/dynamo/lowering/_decompositions.py b/py/torch_tensorrt/dynamo/lowering/_decompositions.py index 534bc3eac5..d2bfeb501a 100644 --- a/py/torch_tensorrt/dynamo/lowering/_decompositions.py +++ b/py/torch_tensorrt/dynamo/lowering/_decompositions.py @@ -3,7 +3,8 @@ from typing import Any, Callable, Dict, List, Optional import torch -from torch._decomp import _decomp_table_to_post_autograd_aten, register_decomposition +from torch._decomp import register_decomposition +from torch._export.utils import _decomp_table_to_post_autograd_aten from torch._ops import OpOverload from torch_tensorrt.dynamo._defaults import default_device from torch_tensorrt.dynamo.conversion.converter_utils import get_positive_dim diff --git a/pyproject.toml b/pyproject.toml index 1284e458f4..1ec43f76fe 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,7 @@ requires = [ "cffi>=1.15.1", "typing-extensions>=4.7.0", "future>=0.18.3", - "tensorrt-cu12==10.3.0", + #"tensorrt-cu12==10.3.0", "torch>=2.6.0.dev,<2.7.0", "pybind11==2.6.2", "numpy", @@ -55,9 +55,9 @@ keywords = [ ] dependencies = [ "torch>=2.6.0.dev,<2.7.0", - "tensorrt-cu12==10.3.0", - "tensorrt-cu12-bindings==10.3.0", - "tensorrt-cu12-libs==10.3.0", + #"tensorrt-cu12==10.3.0", + #"tensorrt-cu12-bindings==10.3.0", + #"tensorrt-cu12-libs==10.3.0", "packaging>=23", "numpy", "typing-extensions>=4.7.0", From 8cf1c413aafeb49b90177cd0fa2bbc8cc9940b00 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Mon, 21 Oct 2024 18:00:09 -0700 Subject: [PATCH 06/12] chore: updates --- docker/Dockerfile.lab | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 docker/Dockerfile.lab diff --git a/docker/Dockerfile.lab b/docker/Dockerfile.lab new file mode 100644 index 0000000000..569acfecad --- /dev/null +++ b/docker/Dockerfile.lab @@ -0,0 +1,33 @@ +# syntax=docker/dockerfile:1 + +# Base image starts with CUDA +ARG BASE_IMG=nvidia/cuda:12.4.1-devel-ubuntu22.04 +FROM ${BASE_IMG} as base +ENV BASE_IMG=nvidia/cuda:12.4.1-devel-ubuntu22.04 + +ARG PYTHON_VERSION=3.10 +ENV PYTHON_VERSION=${PYTHON_VERSION} + +ARG USE_CXX11_ABI +ENV USE_CXX11=${USE_CXX11_ABI} +ENV DEBIAN_FRONTEND=noninteractive + +# Install basic dependencies +RUN apt-get update +RUN apt install -y vim build-essential manpages-dev wget zlib1g software-properties-common git libssl-dev zlib1g-dev libbz2-dev libreadline-dev libsqlite3-dev wget ca-certificates curl llvm libncurses5-dev xz-utils tk-dev libxml2-dev libxmlsec1-dev libffi-dev liblzma-dev mecab-ipadic-utf8 + +# Install PyEnv and desired Python version +ENV HOME="/root" +ENV PYENV_DIR="$HOME/.pyenv" +ENV PATH="$PYENV_DIR/shims:$PYENV_DIR/bin:$PATH" +RUN wget -L https://github.com/pyenv/pyenv-installer/raw/master/bin/pyenv-installer &&\ + chmod 755 pyenv-installer &&\ + bash pyenv-installer &&\ + eval "$(pyenv init -)" + +RUN pyenv install -v ${PYTHON_VERSION} +RUN pyenv global ${PYTHON_VERSION} + +# Setup Bazel via Bazelisk +RUN wget -q https://github.com/bazelbuild/bazelisk/releases/download/v1.17.0/bazelisk-linux-amd64 -O /usr/bin/bazel &&\ + chmod a+x /usr/bin/bazel From 289f84dfeeab492592b0dbeb0aa69c14a5c0fd2a Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Sun, 8 Dec 2024 21:49:42 -0800 Subject: [PATCH 07/12] chore: updates --- examples/dynamo/torch_export_gpt2.py | 22 ++++++++++++---------- 
examples/dynamo/utils.py | 9 +-------- 2 files changed, 13 insertions(+), 18 deletions(-) diff --git a/examples/dynamo/torch_export_gpt2.py b/examples/dynamo/torch_export_gpt2.py index cea0f3adf2..c055de5e3c 100644 --- a/examples/dynamo/torch_export_gpt2.py +++ b/examples/dynamo/torch_export_gpt2.py @@ -57,16 +57,18 @@ # 2) Enable use_explicit_typing=True. Certain layers are explicitly casted to FP32 within the pytorch model and this flag respects this behavior during TRT compilation # 3) Enable use_fp32_acc=True. This ensures all the matmuls are accumulated in FP32 precision (similar to PyTorch) gpt2_ep = export_llm(model, input_ids, max_seq_len=1024) -trt_model = torch_tensorrt.dynamo.compile( - gpt2_ep, - inputs=[input_ids], - enabled_precisions={torch.float32}, - truncate_double=True, - device=DEVICE, - disable_tf32=True, - use_explicit_typing=True, - use_fp32_acc=True, -) +with torch_tensorrt.logging.debug(): + trt_model = torch_tensorrt.dynamo.compile( + gpt2_ep, + inputs=[input_ids], + enabled_precisions={torch.float32}, + truncate_double=True, + device=DEVICE, + disable_tf32=True, + use_explicit_typing=True, + use_fp32_acc=True, + debug=True, + ) # Auto-regressive generation loop for greedy decoding using TensorRT model # We use a custom generate function which is very similar to the huggingface one. diff --git a/examples/dynamo/utils.py b/examples/dynamo/utils.py index 90f1f3b72c..25ad99c12d 100644 --- a/examples/dynamo/utils.py +++ b/examples/dynamo/utils.py @@ -51,14 +51,7 @@ def generate(model, input_seq, max_tokens, eos_token_id): ) while True: - outputs = model( - input_seq, - past_key_values=None, - position_ids=None, - attention_mask=None, - use_cache=False, - token_type_ids=None, - ) + outputs = model(input_seq) logits = outputs.logits next_token_logits = logits[:, -1, :] next_tokens = torch.argmax(next_token_logits, dim=-1) From cb9409f30a76f1398d66b7bb06ea70dfd44bd9ac Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Sun, 8 Dec 2024 22:56:29 -0800 Subject: [PATCH 08/12] chore: clean up --- examples/dynamo/torch_compile_llama2.py | 89 ------------------------- examples/dynamo/torch_export_gpt2.py | 22 +++--- py/requirements.txt | 2 +- py/torch_tensorrt/dynamo/_compiler.py | 1 - pyproject.toml | 8 +-- 5 files changed, 15 insertions(+), 107 deletions(-) delete mode 100644 examples/dynamo/torch_compile_llama2.py diff --git a/examples/dynamo/torch_compile_llama2.py b/examples/dynamo/torch_compile_llama2.py deleted file mode 100644 index 40ddc97d2c..0000000000 --- a/examples/dynamo/torch_compile_llama2.py +++ /dev/null @@ -1,89 +0,0 @@ -""" -.. _torch_compile_gpt2: - -Compiling GPT2 using the Torch-TensorRT `torch.compile` Backend -========================================================== - -This interactive script is intended as a sample of the Torch-TensorRT workflow with `torch.compile` on a GPT2 model.""" - -# %% -# Imports and Model Definition -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -import torch -import torch_tensorrt -from transformers import AutoModelForCausalLM, AutoTokenizer -from utils import generate - -# %% - -# Define the parameters -MAX_TOKENS = 32 -DEVICE = torch.device("cuda:0") - -# Define the GPT2 model from hugging face -# kv_cache is not supported in Torch-TRT currently. -# CPU is used here so that GPU memory is reserved for TRT compilation. 
-llama_path = "meta-llama/Llama-2-7b-chat-hf" -with torch.no_grad(): - model = AutoModelForCausalLM.from_pretrained( - llama_path, use_cache=False, attn_implementation="eager" - ).eval() - -tokenizer = AutoTokenizer.from_pretrained(llama_path) - -# %% -# Tokenize a sample input prompt and get pytorch model outputs -prompt = "I enjoy walking with my cute dog" -model_inputs = tokenizer(prompt, return_tensors="pt") -input_ids = model_inputs["input_ids"].cuda() - -# Auto-regressive generation loop for greedy search using PyTorch model. -# We use a custom generate function which is very similar to the huggingface one. -# pyt_gen_tokens = generate(model, input_ids, MAX_TOKENS, tokenizer.eos_token_id) - -# %% -# Compilation with `torch.compile` using tensorrt backend and generate TensorRT outputs -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -# Compile the model and mark the input sequence length to be dynamic -with torch_tensorrt.logging.debug(): - torch._dynamo.mark_dynamic(input_ids, 1, min=7, max=1023) - model.forward = torch.compile( - model.forward, - backend="tensorrt", - dynamic=None, - options={ - "enabled_precisions": {torch.float32}, - "disable_tf32": True, - "debug": True, - # "use_python_runtime": True - }, - ) -model(input_ids) -breakpoint() -model(input_ids) -# Auto-regressive generation loop for greedy decoding using TensorRT model -# We use a custom generate function which is very similar to the huggingface one. -# Move inputs to GPU -input_ids = input_ids.to(DEVICE) -trt_gen_tokens = generate(model, input_ids, MAX_TOKENS, tokenizer.eos_token_id) - -# %% -# Decode the output sentences of PyTorch and TensorRT -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -print("=============================") -print( - "Pytorch model generated text: ", - tokenizer.decode(pyt_gen_tokens[0], skip_special_tokens=True), -) -print("=============================") -print( - "TensorRT model generated text: ", - tokenizer.decode(trt_gen_tokens[0], skip_special_tokens=True), -) - -# %% -# The output sentences should look like -# -# diff --git a/examples/dynamo/torch_export_gpt2.py b/examples/dynamo/torch_export_gpt2.py index c055de5e3c..cea0f3adf2 100644 --- a/examples/dynamo/torch_export_gpt2.py +++ b/examples/dynamo/torch_export_gpt2.py @@ -57,18 +57,16 @@ # 2) Enable use_explicit_typing=True. Certain layers are explicitly casted to FP32 within the pytorch model and this flag respects this behavior during TRT compilation # 3) Enable use_fp32_acc=True. This ensures all the matmuls are accumulated in FP32 precision (similar to PyTorch) gpt2_ep = export_llm(model, input_ids, max_seq_len=1024) -with torch_tensorrt.logging.debug(): - trt_model = torch_tensorrt.dynamo.compile( - gpt2_ep, - inputs=[input_ids], - enabled_precisions={torch.float32}, - truncate_double=True, - device=DEVICE, - disable_tf32=True, - use_explicit_typing=True, - use_fp32_acc=True, - debug=True, - ) +trt_model = torch_tensorrt.dynamo.compile( + gpt2_ep, + inputs=[input_ids], + enabled_precisions={torch.float32}, + truncate_double=True, + device=DEVICE, + disable_tf32=True, + use_explicit_typing=True, + use_fp32_acc=True, +) # Auto-regressive generation loop for greedy decoding using TensorRT model # We use a custom generate function which is very similar to the huggingface one. 
diff --git a/py/requirements.txt b/py/requirements.txt index d480ccbd57..361afab365 100644 --- a/py/requirements.txt +++ b/py/requirements.txt @@ -3,6 +3,6 @@ packaging pybind11==2.6.2 --extra-index-url https://download.pytorch.org/whl/nightly/cu124 torch>=2.6.0.dev,<2.7.0 -#torchvision>=0.20.0.dev,<0.21.0 +torchvision>=0.20.0.dev,<0.21.0 --extra-index-url https://pypi.ngc.nvidia.com pyyaml diff --git a/py/torch_tensorrt/dynamo/_compiler.py b/py/torch_tensorrt/dynamo/_compiler.py index d7792e7464..9859668cd9 100644 --- a/py/torch_tensorrt/dynamo/_compiler.py +++ b/py/torch_tensorrt/dynamo/_compiler.py @@ -608,7 +608,6 @@ def compile( trt_gm = compile_module( gm, trt_arg_inputs, trt_kwarg_inputs, settings, engine_cache ) - return trt_gm diff --git a/pyproject.toml b/pyproject.toml index 1ec43f76fe..1284e458f4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,7 @@ requires = [ "cffi>=1.15.1", "typing-extensions>=4.7.0", "future>=0.18.3", - #"tensorrt-cu12==10.3.0", + "tensorrt-cu12==10.3.0", "torch>=2.6.0.dev,<2.7.0", "pybind11==2.6.2", "numpy", @@ -55,9 +55,9 @@ keywords = [ ] dependencies = [ "torch>=2.6.0.dev,<2.7.0", - #"tensorrt-cu12==10.3.0", - #"tensorrt-cu12-bindings==10.3.0", - #"tensorrt-cu12-libs==10.3.0", + "tensorrt-cu12==10.3.0", + "tensorrt-cu12-bindings==10.3.0", + "tensorrt-cu12-libs==10.3.0", "packaging>=23", "numpy", "typing-extensions>=4.7.0", From 8725a483015f9fa64383deaabfd645ae9436ab37 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Sun, 8 Dec 2024 22:58:16 -0800 Subject: [PATCH 09/12] chore: remove redundant file --- docker/Dockerfile.lab | 33 --------------------------------- 1 file changed, 33 deletions(-) delete mode 100644 docker/Dockerfile.lab diff --git a/docker/Dockerfile.lab b/docker/Dockerfile.lab deleted file mode 100644 index 569acfecad..0000000000 --- a/docker/Dockerfile.lab +++ /dev/null @@ -1,33 +0,0 @@ -# syntax=docker/dockerfile:1 - -# Base image starts with CUDA -ARG BASE_IMG=nvidia/cuda:12.4.1-devel-ubuntu22.04 -FROM ${BASE_IMG} as base -ENV BASE_IMG=nvidia/cuda:12.4.1-devel-ubuntu22.04 - -ARG PYTHON_VERSION=3.10 -ENV PYTHON_VERSION=${PYTHON_VERSION} - -ARG USE_CXX11_ABI -ENV USE_CXX11=${USE_CXX11_ABI} -ENV DEBIAN_FRONTEND=noninteractive - -# Install basic dependencies -RUN apt-get update -RUN apt install -y vim build-essential manpages-dev wget zlib1g software-properties-common git libssl-dev zlib1g-dev libbz2-dev libreadline-dev libsqlite3-dev wget ca-certificates curl llvm libncurses5-dev xz-utils tk-dev libxml2-dev libxmlsec1-dev libffi-dev liblzma-dev mecab-ipadic-utf8 - -# Install PyEnv and desired Python version -ENV HOME="/root" -ENV PYENV_DIR="$HOME/.pyenv" -ENV PATH="$PYENV_DIR/shims:$PYENV_DIR/bin:$PATH" -RUN wget -L https://github.com/pyenv/pyenv-installer/raw/master/bin/pyenv-installer &&\ - chmod 755 pyenv-installer &&\ - bash pyenv-installer &&\ - eval "$(pyenv init -)" - -RUN pyenv install -v ${PYTHON_VERSION} -RUN pyenv global ${PYTHON_VERSION} - -# Setup Bazel via Bazelisk -RUN wget -q https://github.com/bazelbuild/bazelisk/releases/download/v1.17.0/bazelisk-linux-amd64 -O /usr/bin/bazel &&\ - chmod a+x /usr/bin/bazel From 332d32042788dfa16f66620cd66778a2d28524c4 Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Sun, 8 Dec 2024 22:59:26 -0800 Subject: [PATCH 10/12] chore: rebase --- examples/dynamo/requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/dynamo/requirements.txt b/examples/dynamo/requirements.txt index 41fe29f09c..59a802918c 100644 --- 
a/examples/dynamo/requirements.txt +++ b/examples/dynamo/requirements.txt @@ -1,4 +1,4 @@ cupy==13.1.0 triton==2.3.0 diffusers==0.30.3 -transformers==4.44.2 +transformers==4.44.2 \ No newline at end of file From 4ed54100a0700990b4d3ed742b236dd28536992e Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Mon, 16 Dec 2024 10:52:10 -0800 Subject: [PATCH 11/12] chore: update docs --- docsrc/index.rst | 2 ++ examples/dynamo/README.rst | 1 + 2 files changed, 3 insertions(+) diff --git a/docsrc/index.rst b/docsrc/index.rst index 5d88c8ecae..c762080649 100644 --- a/docsrc/index.rst +++ b/docsrc/index.rst @@ -134,6 +134,7 @@ Model Zoo * :ref:`torch_compile_resnet` * :ref:`torch_compile_transformer` * :ref:`torch_compile_stable_diffusion` +* :ref:`torch_compile_gpt2` * :ref:`torch_export_gpt2` * :ref:`torch_export_llama2` * :ref:`notebooks` @@ -148,6 +149,7 @@ Model Zoo tutorials/_rendered_examples/dynamo/torch_compile_stable_diffusion tutorials/_rendered_examples/distributed_inference/data_parallel_gpt2 tutorials/_rendered_examples/distributed_inference/data_parallel_stable_diffusion + tutorials/_rendered_examples/dynamo/torch_compile_gpt2 tutorials/_rendered_examples/dynamo/torch_export_gpt2 tutorials/_rendered_examples/dynamo/torch_export_llama2 tutorials/notebooks diff --git a/examples/dynamo/README.rst b/examples/dynamo/README.rst index 60f1969be2..5d3b9d4261 100644 --- a/examples/dynamo/README.rst +++ b/examples/dynamo/README.rst @@ -17,5 +17,6 @@ Model Zoo * :ref:`torch_compile_resnet`: Compiling a ResNet model using the Torch Compile Frontend for ``torch_tensorrt.compile`` * :ref:`torch_compile_transformer`: Compiling a Transformer model using ``torch.compile`` * :ref:`torch_compile_stable_diffusion`: Compiling a Stable Diffusion model using ``torch.compile`` +* :ref:`_torch_compile_gpt2`: Compiling a GPT2 model using ``torch.compile`` * :ref:`_torch_export_gpt2`: Compiling a GPT2 model using AOT workflow (`ir=dynamo`) * :ref:`_torch_export_llama2`: Compiling a Llama2 model using AOT workflow (`ir=dynamo`) \ No newline at end of file From 9b8773c760250b0f168adbb7346831805a926c7d Mon Sep 17 00:00:00 2001 From: Dheeraj Peri Date: Mon, 16 Dec 2024 12:05:17 -0800 Subject: [PATCH 12/12] chore: update docs --- examples/dynamo/torch_compile_gpt2.py | 67 +++++++++++++++++---------- 1 file changed, 42 insertions(+), 25 deletions(-) diff --git a/examples/dynamo/torch_compile_gpt2.py b/examples/dynamo/torch_compile_gpt2.py index 6c6e1b03a2..5d41c3ed84 100644 --- a/examples/dynamo/torch_compile_gpt2.py +++ b/examples/dynamo/torch_compile_gpt2.py @@ -1,27 +1,40 @@ """ .. _torch_compile_gpt2: -Compiling GPT2 using the Torch-TensorRT `torch.compile` Backend +Compiling GPT2 using the Torch-TensorRT ``torch.compile`` frontend ========================================================== -This interactive script is intended as a sample of the Torch-TensorRT workflow with `torch.compile` on a GPT2 model.""" +This example illustrates the state of the art model `GPT2 `_ optimized using +``torch.compile`` frontend of Torch-TensorRT. Install the following dependencies before compilation + +.. code-block:: python + + pip install -r requirements.txt + +GPT2 is a causal (unidirectional) transformer pretrained using language modeling on a very large corpus of text data. In this example, we use the GPT2 model available at `HuggingFace `_ and apply torch.compile on it to +get the graph module representation of the graph. Torch-TensorRT converts this graph into an optimized TensorRT engine. 
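+
+In outline, the workflow reduces to the following simplified sketch (the names used
+here are the ones defined in the body of this script, which is the complete,
+runnable version):
+
+.. code-block:: python
+
+    model.forward = torch.compile(model.forward, backend="tensorrt",
+                                  options={"enabled_precisions": {torch.float32}})
+    trt_gen_tokens = model.generate(input_ids, max_length=MAX_LENGTH, use_cache=False)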
+""" # %% -# Imports and Model Definition -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# Import necessary libraries +# ----------------------------- import torch import torch_tensorrt from transformers import AutoModelForCausalLM, AutoTokenizer # %% - -# Define the parameters -MAX_TOKENS = 32 +# Define the necessary parameters +# ----------------------------- +# Torch-TensorRT requires a GPU for successful compilation of the model. +# ``MAX_LENGTH`` is the maximum length the generated tokens can have. This corresponds to the length of the input prompt + +# number of new tokens generated +MAX_LENGTH = 32 DEVICE = torch.device("cuda:0") -# Define the GPT2 model from hugging face -# kv_cache is not supported in Torch-TRT currently. -# CPU is used here so that GPU memory is reserved for TRT compilation. +# %% +# Model definition +# ----------------------------- +# We use ``AutoModelForCausalLM`` class to load the pretrained GPT2 model from hugging face. ``kv_cache`` is not supported in Torch-TRT currently so ``use_cache=False`` with torch.no_grad(): tokenizer = AutoTokenizer.from_pretrained("gpt2") model = ( @@ -36,24 +49,28 @@ ) # %% +# PyTorch inference +# ----------------------------- # Tokenize a sample input prompt and get pytorch model outputs prompt = "I enjoy walking with my cute dog" model_inputs = tokenizer(prompt, return_tensors="pt") input_ids = model_inputs["input_ids"].cuda() -# Auto-regressive generation loop for greedy search using PyTorch model. +# %% +# The ``generate()`` API of the ``AutoModelForCausalLM`` class is used for auto-regressive generation with greedy decoding. pyt_gen_tokens = model.generate( input_ids, - max_length=MAX_TOKENS, + max_length=MAX_LENGTH, use_cache=False, pad_token_id=tokenizer.eos_token_id, ) # %% -# Compilation with `torch.compile` using tensorrt backend and generate TensorRT outputs -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -# Compile the model and mark the input sequence length to be dynamic +# Torch-TensorRT compilation and inference +# ----------------------------- +# The input sequence length is dynamic, so we mark it using ``torch._dynamo.mark_dynamic`` API. +# We provide a (min, max) range of this value so that TensorRT knows in advance what values to optimize for. +# Usually, this would be the context length for the model. We start with ``min=2`` due to the `0/1 specialization `_ torch._dynamo.mark_dynamic(input_ids, 1, min=2, max=1023) model.forward = torch.compile( model.forward, @@ -63,25 +80,23 @@ "enabled_precisions": {torch.float32}, "disable_tf32": True, "min_block_size": 1, - "debug": True, }, ) +# %% # Auto-regressive generation loop for greedy decoding using TensorRT model # The first token generation compiles the model using TensorRT and the second token -# encounters recompilation +# encounters recompilation (which is an issue currently that would be resolved in the future) trt_gen_tokens = model.generate( inputs=input_ids, - max_length=MAX_TOKENS, + max_length=MAX_LENGTH, use_cache=False, pad_token_id=tokenizer.eos_token_id, ) # %% # Decode the output sentences of PyTorch and TensorRT -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -print("=============================") +# ----------------------------- print( "Pytorch model generated text: ", tokenizer.decode(pyt_gen_tokens[0], skip_special_tokens=True), @@ -95,6 +110,8 @@ # %% # The output sentences should look like -# Pytorch model generated text: I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with my dog. 
I'm not sure if I'll -# ============================= -# TensorRT model generated text: I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with my dog. I'm not sure if I'll +""" +Pytorch model generated text: I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with my dog. I'm not sure if I'll +============================= +TensorRT model generated text: I enjoy walking with my cute dog, but I'm not sure if I'll ever be able to walk with my dog. I'm not sure if I'll +"""
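+
+# %%
+# Optional, illustrative sanity check (not part of the original example): time a
+# single forward pass of the compiled model. The TensorRT engine is already built
+# at this point, so the call below reflects steady-state latency;
+# torch.cuda.synchronize() makes sure the GPU work is included in the measurement.
+import time
+
+torch.cuda.synchronize()
+start = time.perf_counter()
+model(input_ids)
+torch.cuda.synchronize()
+print(f"Single TensorRT-accelerated forward pass: {time.perf_counter() - start:.4f} s")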