From 2be9a8349f85f4cfcc4e26314bd5f165ef6d2b03 Mon Sep 17 00:00:00 2001
From: David Tolnay <dtolnay@gmail.com>
Date: Thu, 25 Nov 2021 11:35:28 -0800
Subject: [PATCH 1/2] Eliminate bunch of copies of error codepath from
 Utf8LossyChunksIter

Using a macro to stamp out 7 identical copies of the nontrivial slicing
logic to exit this loop didn't seem like a necessary use of a macro. The
early return case can be handled by `break` without practically any
changes to the logic inside the loop.

All this code is from early 2014 (7.5 years old, pre-1.0) so it's
possible there were compiler limitations that forced the macro way at
the time.

Confirmed that `x.py bench library/alloc --stage 0 --test-args from_utf8_lossy`
is unaffected on my machine.
---
 library/core/src/str/lossy.rs | 69 ++++++++++++++++-------------------
 1 file changed, 31 insertions(+), 38 deletions(-)

diff --git a/library/core/src/str/lossy.rs b/library/core/src/str/lossy.rs
index 6c21a5e802026..748ee314e73b9 100644
--- a/library/core/src/str/lossy.rs
+++ b/library/core/src/str/lossy.rs
@@ -61,36 +61,26 @@ impl<'a> Iterator for Utf8LossyChunksIter<'a> {
         }
 
         let mut i = 0;
+        let mut valid_up_to = 0;
         while i < self.source.len() {
-            let i_ = i;
-
-            // SAFETY: `i` starts at `0`, is less than `self.source.len()`, and
-            // only increases, so `0 <= i < self.source.len()`.
+            // SAFETY: `i < self.source.len()` per previous line.
+            // For some reason the following are both significantly slower:
+            // while let Some(&byte) = self.source.get(i) {
+            // while let Some(byte) = self.source.get(i).copied() {
             let byte = unsafe { *self.source.get_unchecked(i) };
             i += 1;
 
             if byte < 128 {
+                // This could be a `1 => ...` case in the match below, but for
+                // the common case of all-ASCII inputs, we bypass loading the
+                // sizeable UTF8_CHAR_WIDTH table into cache.
             } else {
                 let w = utf8_char_width(byte);
 
-                macro_rules! error {
-                    () => {{
-                        // SAFETY: We have checked up to `i` that source is valid UTF-8.
-                        unsafe {
-                            let r = Utf8LossyChunk {
-                                valid: from_utf8_unchecked(&self.source[0..i_]),
-                                broken: &self.source[i_..i],
-                            };
-                            self.source = &self.source[i..];
-                            return Some(r);
-                        }
-                    }};
-                }
-
                 match w {
                     2 => {
                         if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
-                            error!();
+                            break;
                         }
                         i += 1;
                     }
@@ -100,13 +90,11 @@ impl<'a> Iterator for Utf8LossyChunksIter<'a> {
                             (0xE1..=0xEC, 0x80..=0xBF) => (),
                             (0xED, 0x80..=0x9F) => (),
                             (0xEE..=0xEF, 0x80..=0xBF) => (),
-                            _ => {
-                                error!();
-                            }
+                            _ => break,
                         }
                         i += 1;
                         if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
-                            error!();
+                            break;
                         }
                         i += 1;
                     }
@@ -115,34 +103,39 @@ impl<'a> Iterator for Utf8LossyChunksIter<'a> {
                             (0xF0, 0x90..=0xBF) => (),
                             (0xF1..=0xF3, 0x80..=0xBF) => (),
                             (0xF4, 0x80..=0x8F) => (),
-                            _ => {
-                                error!();
-                            }
+                            _ => break,
                         }
                         i += 1;
                         if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
-                            error!();
+                            break;
                         }
                         i += 1;
                         if safe_get(self.source, i) & 192 != TAG_CONT_U8 {
-                            error!();
+                            break;
                         }
                         i += 1;
                     }
-                    _ => {
-                        error!();
-                    }
+                    _ => break,
                 }
             }
+
+            valid_up_to = i;
         }
 
-        let r = Utf8LossyChunk {
-            // SAFETY: We have checked that the entire source is valid UTF-8.
-            valid: unsafe { from_utf8_unchecked(self.source) },
-            broken: &[],
-        };
-        self.source = &[];
-        Some(r)
+        // SAFETY: `i <= self.source.len()` because it only ever increments by 1
+        // and the loop is terminated as soon as that goes beyond bounds.
+        let (inspected, remaining) = unsafe { self.source.split_at_unchecked(i) };
+        self.source = remaining;
+
+        // SAFETY: `valid_up_to <= i` because it is only ever assigned via
+        // `valid_up_to = i` and `i` only increases.
+        let (valid, broken) = unsafe { inspected.split_at_unchecked(valid_up_to) };
+
+        Some(Utf8LossyChunk {
+            // SAFETY: All bytes up to `valid_up_to` are valid UTF-8.
+            valid: unsafe { from_utf8_unchecked(valid) },
+            broken,
+        })
     }
 }
 

From c6810a569f52feff03a36fb496780410b2912783 Mon Sep 17 00:00:00 2001
From: David Tolnay <dtolnay@gmail.com>
Date: Fri, 26 Nov 2021 12:57:36 -0800
Subject: [PATCH 2/2] Clarify safety comment on using i to index into
 self.source

---
 library/core/src/str/lossy.rs | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/library/core/src/str/lossy.rs b/library/core/src/str/lossy.rs
index 748ee314e73b9..32bd22846e7dd 100644
--- a/library/core/src/str/lossy.rs
+++ b/library/core/src/str/lossy.rs
@@ -122,8 +122,14 @@ impl<'a> Iterator for Utf8LossyChunksIter<'a> {
             valid_up_to = i;
         }
 
-        // SAFETY: `i <= self.source.len()` because it only ever increments by 1
-        // and the loop is terminated as soon as that goes beyond bounds.
+        // SAFETY: `i <= self.source.len()` because it is only ever incremented
+        // via `i += 1` and in between every single one of those increments, `i`
+        // is compared against `self.source.len()`. That happens either
+        // literally by `i < self.source.len()` in the while-loop's condition,
+        // or indirectly by `safe_get(self.source, i) & 192 != TAG_CONT_U8`. The
+        // loop is terminated as soon as the latest `i += 1` has made `i` no
+        // longer less than `self.source.len()`, which means it'll be at most
+        // equal to `self.source.len()`.
         let (inspected, remaining) = unsafe { self.source.split_at_unchecked(i) };
         self.source = remaining;