From 6d9ee6ee26c296d9361d5f0fb479b2462687fe4e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Esteban=20K=C3=BCber?= Date: Thu, 25 Jul 2024 18:38:28 +0000 Subject: [PATCH 1/3] Change output normalization logic to be linear against size of output Scan strings to be normalized for printing in a linear scan and collect the resulting `String` only once. Use a binary search when looking for chars to be replaced, instead of a `HashMap::get`. --- Cargo.lock | 1 + compiler/rustc_errors/Cargo.toml | 1 + compiler/rustc_errors/src/emitter.rs | 98 +++++++++++++++------------- 3 files changed, 54 insertions(+), 46 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 281599a21fc14..43fbaf966b79f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3867,6 +3867,7 @@ version = "0.0.0" dependencies = [ "annotate-snippets 0.10.2", "derive_setters", + "either", "rustc_ast", "rustc_ast_pretty", "rustc_data_structures", diff --git a/compiler/rustc_errors/Cargo.toml b/compiler/rustc_errors/Cargo.toml index 2fff9f2de50fb..ddf72a2d5fdb2 100644 --- a/compiler/rustc_errors/Cargo.toml +++ b/compiler/rustc_errors/Cargo.toml @@ -7,6 +7,7 @@ edition = "2021" # tidy-alphabetical-start annotate-snippets = "0.10" derive_setters = "0.1.6" +either = "1.5.0" rustc_ast = { path = "../rustc_ast" } rustc_ast_pretty = { path = "../rustc_ast_pretty" } rustc_data_structures = { path = "../rustc_data_structures" } diff --git a/compiler/rustc_errors/src/emitter.rs b/compiler/rustc_errors/src/emitter.rs index 73908e5808569..d186996040bd2 100644 --- a/compiler/rustc_errors/src/emitter.rs +++ b/compiler/rustc_errors/src/emitter.rs @@ -16,6 +16,7 @@ use std::iter; use std::path::Path; use derive_setters::Setters; +use either::Either; use rustc_data_structures::fx::{FxHashMap, FxIndexMap, FxIndexSet}; use rustc_data_structures::sync::{DynSend, IntoDynSyncSend, Lrc}; use rustc_error_messages::{FluentArgs, SpanLabel}; @@ -2559,60 +2560,65 @@ fn num_decimal_digits(num: usize) -> usize { // We replace some characters so the CLI output is always consistent and underlines aligned. // Keep the following list in sync with `rustc_span::char_width`. +// ATTENTION: keep lexicografically sorted so that the binary search will work const OUTPUT_REPLACEMENTS: &[(char, &str)] = &[ - ('\t', " "), // We do our own tab replacement - ('\u{200D}', ""), // Replace ZWJ with nothing for consistent terminal output of grapheme clusters. - ('\u{202A}', "�"), // The following unicode text flow control characters are inconsistently - ('\u{202B}', "�"), // supported across CLIs and can cause confusion due to the bytes on disk - ('\u{202D}', "�"), // not corresponding to the visible source code, so we replace them always. - ('\u{202E}', "�"), + // In terminals without Unicode support the following will be garbled, but in *all* terminals + // the underlying codepoint will be as well. We could gate this replacement behind a "unicode + // support" gate. + ('\0', "␀"), + ('\u{1}', "␁"), + ('\u{2}', "␂"), + ('\u{3}', "␃"), + ('\u{4}', "␄"), + ('\u{5}', "␅"), + ('\u{6}', "␆"), + ('\u{7}', "␇"), + ('\u{8}', "␈"), + ('\t', " "), // We do our own tab replacement + ('\u{b}', "␋"), + ('\u{c}', "␌"), + ('\r', "␍"), + ('\u{e}', "␎"), + ('\u{f}', "␏"), + ('\u{10}', "␐"), + ('\u{11}', "␑"), + ('\u{12}', "␒"), + ('\u{13}', "␓"), + ('\u{14}', "␔"), + ('\u{15}', "␕"), + ('\u{16}', "␖"), + ('\u{17}', "␗"), + ('\u{18}', "␘"), + ('\u{19}', "␙"), + ('\u{1a}', "␚"), + ('\u{1b}', "␛"), + ('\u{1c}', "␜"), + ('\u{1d}', "␝"), + ('\u{1e}', "␞"), + ('\u{1f}', "␟"), + ('\u{7f}', "␡"), + ('\u{200d}', ""), // Replace ZWJ for consistent terminal output of grapheme clusters. + ('\u{202a}', "�"), // The following unicode text flow control characters are inconsistently + ('\u{202b}', "�"), // supported across CLIs and can cause confusion due to the bytes on disk + ('\u{202c}', "�"), // not corresponding to the visible source code, so we replace them always. + ('\u{202d}', "�"), + ('\u{202e}', "�"), ('\u{2066}', "�"), ('\u{2067}', "�"), ('\u{2068}', "�"), - ('\u{202C}', "�"), ('\u{2069}', "�"), - // In terminals without Unicode support the following will be garbled, but in *all* terminals - // the underlying codepoint will be as well. We could gate this replacement behind a "unicode - // support" gate. - ('\u{0000}', "␀"), - ('\u{0001}', "␁"), - ('\u{0002}', "␂"), - ('\u{0003}', "␃"), - ('\u{0004}', "␄"), - ('\u{0005}', "␅"), - ('\u{0006}', "␆"), - ('\u{0007}', "␇"), - ('\u{0008}', "␈"), - ('\u{000B}', "␋"), - ('\u{000C}', "␌"), - ('\u{000D}', "␍"), - ('\u{000E}', "␎"), - ('\u{000F}', "␏"), - ('\u{0010}', "␐"), - ('\u{0011}', "␑"), - ('\u{0012}', "␒"), - ('\u{0013}', "␓"), - ('\u{0014}', "␔"), - ('\u{0015}', "␕"), - ('\u{0016}', "␖"), - ('\u{0017}', "␗"), - ('\u{0018}', "␘"), - ('\u{0019}', "␙"), - ('\u{001A}', "␚"), - ('\u{001B}', "␛"), - ('\u{001C}', "␜"), - ('\u{001D}', "␝"), - ('\u{001E}', "␞"), - ('\u{001F}', "␟"), - ('\u{007F}', "␡"), ]; fn normalize_whitespace(str: &str) -> String { - let mut s = str.to_string(); - for (c, replacement) in OUTPUT_REPLACEMENTS { - s = s.replace(*c, replacement); - } - s + // Scan the input string for a character in the ordered table above. If it's present, replace + // it with it's alternative string (it can be more than 1 char!). Otherwise, retain the input + // char. At the end, allocate all chars into a string in one operation. + str.chars() + .flat_map(|c| match OUTPUT_REPLACEMENTS.binary_search_by_key(&c, |(k, _)| *k) { + Ok(i) => Either::Left(OUTPUT_REPLACEMENTS[i].1.chars()), + _ => Either::Right([c].into_iter()), + }) + .collect() } fn draw_col_separator(buffer: &mut StyledBuffer, line: usize, col: usize) { From 3945480ce7b93eefc2fec90bb87bc53a19e30f96 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Esteban=20K=C3=BCber?= Date: Mon, 29 Jul 2024 18:35:11 +0000 Subject: [PATCH 2/3] Use `fold` instead of `flat_map` --- Cargo.lock | 1 - compiler/rustc_errors/Cargo.toml | 1 - compiler/rustc_errors/src/emitter.rs | 16 ++++++++-------- 3 files changed, 8 insertions(+), 10 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 43fbaf966b79f..281599a21fc14 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3867,7 +3867,6 @@ version = "0.0.0" dependencies = [ "annotate-snippets 0.10.2", "derive_setters", - "either", "rustc_ast", "rustc_ast_pretty", "rustc_data_structures", diff --git a/compiler/rustc_errors/Cargo.toml b/compiler/rustc_errors/Cargo.toml index ddf72a2d5fdb2..2fff9f2de50fb 100644 --- a/compiler/rustc_errors/Cargo.toml +++ b/compiler/rustc_errors/Cargo.toml @@ -7,7 +7,6 @@ edition = "2021" # tidy-alphabetical-start annotate-snippets = "0.10" derive_setters = "0.1.6" -either = "1.5.0" rustc_ast = { path = "../rustc_ast" } rustc_ast_pretty = { path = "../rustc_ast_pretty" } rustc_data_structures = { path = "../rustc_data_structures" } diff --git a/compiler/rustc_errors/src/emitter.rs b/compiler/rustc_errors/src/emitter.rs index d186996040bd2..18e077ed8a22a 100644 --- a/compiler/rustc_errors/src/emitter.rs +++ b/compiler/rustc_errors/src/emitter.rs @@ -16,7 +16,6 @@ use std::iter; use std::path::Path; use derive_setters::Setters; -use either::Either; use rustc_data_structures::fx::{FxHashMap, FxIndexMap, FxIndexSet}; use rustc_data_structures::sync::{DynSend, IntoDynSyncSend, Lrc}; use rustc_error_messages::{FluentArgs, SpanLabel}; @@ -2609,16 +2608,17 @@ const OUTPUT_REPLACEMENTS: &[(char, &str)] = &[ ('\u{2069}', "�"), ]; -fn normalize_whitespace(str: &str) -> String { +fn normalize_whitespace(s: &str) -> String { // Scan the input string for a character in the ordered table above. If it's present, replace // it with it's alternative string (it can be more than 1 char!). Otherwise, retain the input // char. At the end, allocate all chars into a string in one operation. - str.chars() - .flat_map(|c| match OUTPUT_REPLACEMENTS.binary_search_by_key(&c, |(k, _)| *k) { - Ok(i) => Either::Left(OUTPUT_REPLACEMENTS[i].1.chars()), - _ => Either::Right([c].into_iter()), - }) - .collect() + s.chars().fold(String::with_capacity(s.len()), |mut s, c| { + match OUTPUT_REPLACEMENTS.binary_search_by_key(&c, |(k, _)| *k) { + Ok(i) => s.push_str(OUTPUT_REPLACEMENTS[i].1), + _ => s.push(c), + } + s + }) } fn draw_col_separator(buffer: &mut StyledBuffer, line: usize, col: usize) { From 51b5bb179884d71d4758e05c7913c26f3d03385c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Esteban=20K=C3=BCber?= Date: Tue, 30 Jul 2024 00:18:38 +0000 Subject: [PATCH 3/3] Enforce sort order --- compiler/rustc_errors/src/emitter.rs | 64 ++++++++++++++-------------- 1 file changed, 33 insertions(+), 31 deletions(-) diff --git a/compiler/rustc_errors/src/emitter.rs b/compiler/rustc_errors/src/emitter.rs index 18e077ed8a22a..8b0a3888eb78f 100644 --- a/compiler/rustc_errors/src/emitter.rs +++ b/compiler/rustc_errors/src/emitter.rs @@ -2561,41 +2561,42 @@ fn num_decimal_digits(num: usize) -> usize { // Keep the following list in sync with `rustc_span::char_width`. // ATTENTION: keep lexicografically sorted so that the binary search will work const OUTPUT_REPLACEMENTS: &[(char, &str)] = &[ + // tidy-alphabetical-start // In terminals without Unicode support the following will be garbled, but in *all* terminals // the underlying codepoint will be as well. We could gate this replacement behind a "unicode // support" gate. ('\0', "␀"), - ('\u{1}', "␁"), - ('\u{2}', "␂"), - ('\u{3}', "␃"), - ('\u{4}', "␄"), - ('\u{5}', "␅"), - ('\u{6}', "␆"), - ('\u{7}', "␇"), - ('\u{8}', "␈"), - ('\t', " "), // We do our own tab replacement - ('\u{b}', "␋"), - ('\u{c}', "␌"), - ('\r', "␍"), - ('\u{e}', "␎"), - ('\u{f}', "␏"), - ('\u{10}', "␐"), - ('\u{11}', "␑"), - ('\u{12}', "␒"), - ('\u{13}', "␓"), - ('\u{14}', "␔"), - ('\u{15}', "␕"), - ('\u{16}', "␖"), - ('\u{17}', "␗"), - ('\u{18}', "␘"), - ('\u{19}', "␙"), - ('\u{1a}', "␚"), - ('\u{1b}', "␛"), - ('\u{1c}', "␜"), - ('\u{1d}', "␝"), - ('\u{1e}', "␞"), - ('\u{1f}', "␟"), - ('\u{7f}', "␡"), + ('\u{0001}', "␁"), + ('\u{0002}', "␂"), + ('\u{0003}', "␃"), + ('\u{0004}', "␄"), + ('\u{0005}', "␅"), + ('\u{0006}', "␆"), + ('\u{0007}', "␇"), + ('\u{0008}', "␈"), + ('\u{0009}', " "), // We do our own tab replacement + ('\u{000b}', "␋"), + ('\u{000c}', "␌"), + ('\u{000d}', "␍"), + ('\u{000e}', "␎"), + ('\u{000f}', "␏"), + ('\u{0010}', "␐"), + ('\u{0011}', "␑"), + ('\u{0012}', "␒"), + ('\u{0013}', "␓"), + ('\u{0014}', "␔"), + ('\u{0015}', "␕"), + ('\u{0016}', "␖"), + ('\u{0017}', "␗"), + ('\u{0018}', "␘"), + ('\u{0019}', "␙"), + ('\u{001a}', "␚"), + ('\u{001b}', "␛"), + ('\u{001c}', "␜"), + ('\u{001d}', "␝"), + ('\u{001e}', "␞"), + ('\u{001f}', "␟"), + ('\u{007f}', "␡"), ('\u{200d}', ""), // Replace ZWJ for consistent terminal output of grapheme clusters. ('\u{202a}', "�"), // The following unicode text flow control characters are inconsistently ('\u{202b}', "�"), // supported across CLIs and can cause confusion due to the bytes on disk @@ -2606,6 +2607,7 @@ const OUTPUT_REPLACEMENTS: &[(char, &str)] = &[ ('\u{2067}', "�"), ('\u{2068}', "�"), ('\u{2069}', "�"), + // tidy-alphabetical-end ]; fn normalize_whitespace(s: &str) -> String {