From 2be9a8349f85f4cfcc4e26314bd5f165ef6d2b03 Mon Sep 17 00:00:00 2001 From: David Tolnay Date: Thu, 25 Nov 2021 11:35:28 -0800 Subject: [PATCH 1/2] Eliminate bunch of copies of error codepath from Utf8LossyChunksIter Using a macro to stamp out 7 identical copies of the nontrivial slicing logic to exit this loop didn't seem like a necessary use of a macro. The early return case can be handled by `break` without practically any changes to the logic inside the loop. All this code is from early 2014 (7.5 years old, pre-1.0) so it's possible there were compiler limitations that forced the macro way at the time. Confirmed that `x.py bench library/alloc --stage 0 --test-args from_utf8_lossy` is unaffected on my machine. --- library/core/src/str/lossy.rs | 69 ++++++++++++++++------------------- 1 file changed, 31 insertions(+), 38 deletions(-) diff --git a/library/core/src/str/lossy.rs b/library/core/src/str/lossy.rs index 6c21a5e802026..748ee314e73b9 100644 --- a/library/core/src/str/lossy.rs +++ b/library/core/src/str/lossy.rs @@ -61,36 +61,26 @@ impl<'a> Iterator for Utf8LossyChunksIter<'a> { } let mut i = 0; + let mut valid_up_to = 0; while i < self.source.len() { - let i_ = i; - - // SAFETY: `i` starts at `0`, is less than `self.source.len()`, and - // only increases, so `0 <= i < self.source.len()`. + // SAFETY: `i < self.source.len()` per previous line. + // For some reason the following are both significantly slower: + // while let Some(&byte) = self.source.get(i) { + // while let Some(byte) = self.source.get(i).copied() { let byte = unsafe { *self.source.get_unchecked(i) }; i += 1; if byte < 128 { + // This could be a `1 => ...` case in the match below, but for + // the common case of all-ASCII inputs, we bypass loading the + // sizeable UTF8_CHAR_WIDTH table into cache. } else { let w = utf8_char_width(byte); - macro_rules! error { - () => {{ - // SAFETY: We have checked up to `i` that source is valid UTF-8. - unsafe { - let r = Utf8LossyChunk { - valid: from_utf8_unchecked(&self.source[0..i_]), - broken: &self.source[i_..i], - }; - self.source = &self.source[i..]; - return Some(r); - } - }}; - } - match w { 2 => { if safe_get(self.source, i) & 192 != TAG_CONT_U8 { - error!(); + break; } i += 1; } @@ -100,13 +90,11 @@ impl<'a> Iterator for Utf8LossyChunksIter<'a> { (0xE1..=0xEC, 0x80..=0xBF) => (), (0xED, 0x80..=0x9F) => (), (0xEE..=0xEF, 0x80..=0xBF) => (), - _ => { - error!(); - } + _ => break, } i += 1; if safe_get(self.source, i) & 192 != TAG_CONT_U8 { - error!(); + break; } i += 1; } @@ -115,34 +103,39 @@ impl<'a> Iterator for Utf8LossyChunksIter<'a> { (0xF0, 0x90..=0xBF) => (), (0xF1..=0xF3, 0x80..=0xBF) => (), (0xF4, 0x80..=0x8F) => (), - _ => { - error!(); - } + _ => break, } i += 1; if safe_get(self.source, i) & 192 != TAG_CONT_U8 { - error!(); + break; } i += 1; if safe_get(self.source, i) & 192 != TAG_CONT_U8 { - error!(); + break; } i += 1; } - _ => { - error!(); - } + _ => break, } } + + valid_up_to = i; } - let r = Utf8LossyChunk { - // SAFETY: We have checked that the entire source is valid UTF-8. - valid: unsafe { from_utf8_unchecked(self.source) }, - broken: &[], - }; - self.source = &[]; - Some(r) + // SAFETY: `i <= self.source.len()` because it only ever increments by 1 + // and the loop is terminated as soon as that goes beyond bounds. + let (inspected, remaining) = unsafe { self.source.split_at_unchecked(i) }; + self.source = remaining; + + // SAFETY: `valid_up_to <= i` because it is only ever assigned via + // `valid_up_to = i` and `i` only increases. + let (valid, broken) = unsafe { inspected.split_at_unchecked(valid_up_to) }; + + Some(Utf8LossyChunk { + // SAFETY: All bytes up to `valid_up_to` are valid UTF-8. + valid: unsafe { from_utf8_unchecked(valid) }, + broken, + }) } } From c6810a569f52feff03a36fb496780410b2912783 Mon Sep 17 00:00:00 2001 From: David Tolnay Date: Fri, 26 Nov 2021 12:57:36 -0800 Subject: [PATCH 2/2] Clarify safety comment on using i to index into self.source --- library/core/src/str/lossy.rs | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/library/core/src/str/lossy.rs b/library/core/src/str/lossy.rs index 748ee314e73b9..32bd22846e7dd 100644 --- a/library/core/src/str/lossy.rs +++ b/library/core/src/str/lossy.rs @@ -122,8 +122,14 @@ impl<'a> Iterator for Utf8LossyChunksIter<'a> { valid_up_to = i; } - // SAFETY: `i <= self.source.len()` because it only ever increments by 1 - // and the loop is terminated as soon as that goes beyond bounds. + // SAFETY: `i <= self.source.len()` because it is only ever incremented + // via `i += 1` and in between every single one of those increments, `i` + // is compared against `self.source.len()`. That happens either + // literally by `i < self.source.len()` in the while-loop's condition, + // or indirectly by `safe_get(self.source, i) & 192 != TAG_CONT_U8`. The + // loop is terminated as soon as the latest `i += 1` has made `i` no + // longer less than `self.source.len()`, which means it'll be at most + // equal to `self.source.len()`. let (inspected, remaining) = unsafe { self.source.split_at_unchecked(i) }; self.source = remaining;