Skip entire header for llama3 decode #1656

Merged · 6 commits · Oct 10, 2024
Changes from 3 commits
14 changes: 14 additions & 0 deletions tests/torchtune/models/llama3/test_llama3_tokenizer.py
@@ -428,3 +428,17 @@ def test_validate_special_tokens(self):
"<|python_tag|>": 128255,
},
)

    def test_skip_special_tokens(
        self,
        tokenizer,
        user_text_message,
        assistant_text_message,
        user_text_a,
        user_text_b,
        assistant_text,
    ):
        # This should satisfy text = decode(encode(text))
        tokens = user_text_message[1] + assistant_text_message[1]
        text = tokenizer.decode(tokens, skip_special_tokens=True)
        assert text == user_text_a + user_text_b + assistant_text
29 changes: 26 additions & 3 deletions torchtune/models/llama3/_tokenizer.py
@@ -4,6 +4,7 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import re
from typing import Any, Dict, List, Mapping, Optional, Tuple

from torchtune.data import Message, PromptTemplate, truncate
@@ -113,6 +114,12 @@ def __init__(

        self.prompt_template = prompt_template

        # Regex for removing special tokens from the decoded string
        self._special_token_regex = re.compile(r"<\|.*?\|>")
        self._special_token_header_regex = re.compile(
            r"<\|start_header_id\|>.*?<\|end_header_id\|>\n\n"
        )

    def _validate_special_tokens(
        self,
    ):
@@ -131,6 +138,15 @@ def _validate_special_tokens(
            if token not in self.special_tokens:
                raise ValueError(f"{token} missing from special_tokens")

    def _remove_special_tokens(self, text: str) -> str:
        """
        Remove special tokens from the decoded string.
        """
        # First remove the headers, then the remaining special tokens
        return self._special_token_regex.sub(
            "", self._special_token_header_regex.sub("", text)
        )

    @property
    def base_vocab_size(self) -> int:
        return self.tt_model.base_vocab_size
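
For reference, here is a small standalone sketch (not part of this diff) of how the two regexes interact on a decoded Llama3-style string. Stripping the full header span first is what removes the role name and the trailing "\n\n" that a plain special-token pattern would otherwise leave behind; the sample string and variable names below are purely illustrative.

import re

special_token_regex = re.compile(r"<\|.*?\|>")
special_token_header_regex = re.compile(
    r"<\|start_header_id\|>.*?<\|end_header_id\|>\n\n"
)

decoded = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nHello<|eot_id|>"

# Stripping only the individual special tokens keeps the role text and whitespace.
print(special_token_regex.sub("", decoded))  # -> "user\n\nHello"

# Removing the header span first, then the remaining special tokens, leaves clean text.
print(special_token_regex.sub("", special_token_header_regex.sub("", decoded)))  # -> "Hello"
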
@@ -166,11 +182,18 @@ def decode(
        Returns:
            str: The decoded string.
        """
-        return self.tt_model.decode(
-            token_ids,
+        # We will remove special tokens manually via regex on the decoded string.
+        # This is because removing all special tokens does not remove the role and
+        # whitespace added from the special tokens, i.e., the "user" and "\n\n" in
+        # "<|start_header_id|>user<|end_header_id|>\n\n"
Comment on lines +185 to +188

Reviewer: I would maybe move this comment up to where you define self._special_token_regex and self._special_token_header_regex.

Author: This is where it actually happens, so it makes more sense to keep it here? No strong opinions.

Reviewer: Yeah, I feel the same, fine to keep it here then.

+        decoded_string = self.tt_model.decode(
+            token_ids=token_ids,
            truncate_at_eos=truncate_at_eos,
-            skip_special_tokens=skip_special_tokens,
+            skip_special_tokens=False,
        )
+        if skip_special_tokens:
felipemello1 (Sep 24, 2024): Will this test pass?

    text = decode(encode(text), skip_special_tokens=True)

Probably not well formulated, but I want to see if after every encode/decode we are adding \n\n.

felipemello1 (Sep 24, 2024): I think that a string here is necessary to explain why you need this extra logic for "skip_special_tokens" if decode already has this flag. In other words: why do we skip special tokens in two different places? Is there a more elegant way to solve this, like adding the special token to the tt_model directly?

Reviewer: So I understand that the goal here is just to fix skip_special_tokens for the Llama3 tokenizer decode, but it seems to me like we are doing something very unexpected here with skip_special_tokens: (a) we have it defined on the base tokenizer and it's basically a no-op now, and (b) we are now inconsistent on whether this needs to be defined on the ModelTokenizer and the BaseTokenizer. If it is a function of tokenize_messages on the ModelTokenizer more so than the BaseTokenizer, maybe we should update the Protocol along with the other callsites?
+            decoded_string = self._remove_special_tokens(decoded_string)
+        return decoded_string

    def _tokenize_header(self, message: Message) -> List[int]:
        """
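
To round out the picture, a rough usage sketch of the behavior this change targets. The llama3_tokenizer builder, Message, and tokenize_messages come from torchtune, but the tokenizer model path, the example messages, and the exact decoded output are assumptions for illustration only.

from torchtune.data import Message
from torchtune.models.llama3 import llama3_tokenizer

# Placeholder path to a local Llama3 tiktoken model file.
tokenizer = llama3_tokenizer("/path/to/llama3/tokenizer.model")

messages = [
    Message(role="user", content="Tell me a joke."),
    Message(role="assistant", content="Why did the chicken cross the road?"),
]
tokens, _ = tokenizer.tokenize_messages(messages)

# Before this change, skip_special_tokens=True dropped the special tokens themselves
# but kept the role text and whitespace added by the headers (e.g. a leading "user\n\n").
# With the header regex applied after decoding, only the message content should remain.
print(tokenizer.decode(tokens, skip_special_tokens=True))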