From dad95578b0f000807f23fe2d37ca37ea8bd8522c Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Fri, 12 Jul 2024 09:22:16 +1000 Subject: [PATCH 1/3] Add unit tests for `Parser::look_ahead`. It's currently buggy, so some of the test results are surprising, as described in the `FIXME` comments. The bugs will be fixed in subsequent commits. --- compiler/rustc_parse/src/parser/tests.rs | 116 +++++++++++++++++++++++ 1 file changed, 116 insertions(+) diff --git a/compiler/rustc_parse/src/parser/tests.rs b/compiler/rustc_parse/src/parser/tests.rs index 42392ad2163d8..538ec6b8cfd45 100644 --- a/compiler/rustc_parse/src/parser/tests.rs +++ b/compiler/rustc_parse/src/parser/tests.rs @@ -1376,6 +1376,122 @@ fn ttdelim_span() { }); } +// Uses a macro rather than a function so that failure messages mention the +// correct line in the test function. +macro_rules! look { + ($p:ident, $dist:literal, $kind:expr) => { + $p.look_ahead($dist, |tok| assert_eq!($kind, tok.kind)); + }; +} + +#[test] +fn look_ahead() { + create_default_session_globals_then(|| { + let sym_f = Symbol::intern("f"); + let sym_x = Symbol::intern("x"); + #[allow(non_snake_case)] + let sym_S = Symbol::intern("S"); + let raw_no = IdentIsRaw::No; + + let psess = psess(); + let mut p = string_to_parser(&psess, "fn f(x: u32) { x } struct S;".to_string()); + + // Current position is the `fn`. + look!(p, 0, token::Ident(kw::Fn, raw_no)); + look!(p, 1, token::Ident(sym_f, raw_no)); + look!(p, 2, token::OpenDelim(Delimiter::Parenthesis)); + look!(p, 3, token::Ident(sym_x, raw_no)); + look!(p, 4, token::Colon); + look!(p, 5, token::Ident(sym::u32, raw_no)); + look!(p, 6, token::CloseDelim(Delimiter::Parenthesis)); + look!(p, 7, token::OpenDelim(Delimiter::Brace)); + look!(p, 8, token::Ident(sym_x, raw_no)); + look!(p, 9, token::CloseDelim(Delimiter::Brace)); + look!(p, 10, token::Ident(kw::Struct, raw_no)); + look!(p, 11, token::Ident(sym_S, raw_no)); + look!(p, 12, token::Semi); + // Any lookahead past the end of the token stream returns `Eof`. + look!(p, 13, token::Eof); + look!(p, 14, token::Eof); + look!(p, 15, token::Eof); + look!(p, 100, token::Eof); + + // Move forward to the first `x`. + for _ in 0..3 { + p.bump(); + } + look!(p, 0, token::Ident(sym_x, raw_no)); + look!(p, 1, token::Colon); + look!(p, 2, token::Ident(sym::u32, raw_no)); + look!(p, 3, token::CloseDelim(Delimiter::Parenthesis)); + // FIXME(nnethercote) If we lookahead any distance past a close delim + // we currently return that close delim. + look!(p, 4, token::CloseDelim(Delimiter::Parenthesis)); + look!(p, 5, token::CloseDelim(Delimiter::Parenthesis)); + look!(p, 6, token::CloseDelim(Delimiter::Parenthesis)); + look!(p, 100, token::CloseDelim(Delimiter::Parenthesis)); + + // Move forward to the `;`. + for _ in 0..9 { + p.bump(); + } + look!(p, 0, token::Semi); + // Any lookahead past the end of the token stream returns `Eof`. + look!(p, 1, token::Eof); + look!(p, 100, token::Eof); + + // Move one past the `;`, i.e. past the end of the token stream. + p.bump(); + look!(p, 0, token::Eof); + look!(p, 1, token::Eof); + look!(p, 100, token::Eof); + + // Bumping after Eof is idempotent. + p.bump(); + look!(p, 0, token::Eof); + look!(p, 1, token::Eof); + look!(p, 100, token::Eof); + }); +} + +/// FIXME(nnethercote) Currently there is some buggy behaviour when using +/// `look_ahead` not within the outermost token stream, as this test shows. +#[test] +fn look_ahead_non_outermost_stream() { + create_default_session_globals_then(|| { + let sym_f = Symbol::intern("f"); + #[allow(non_snake_case)] + let sym_S = Symbol::intern("S"); + let raw_no = IdentIsRaw::No; + + let psess = psess(); + let mut p = string_to_parser(&psess, "mod m { fn f(x: u32) { x } struct S; }".to_string()); + + // Move forward to the `fn`, which is not within the outermost token + // stream (because it's inside the `mod { ... }`). + for _ in 0..3 { + p.bump(); + } + look!(p, 0, token::Ident(kw::Fn, raw_no)); + look!(p, 1, token::Ident(sym_f, raw_no)); + look!(p, 2, token::OpenDelim(Delimiter::Parenthesis)); + // FIXME(nnethercote) The current code incorrectly skips the `x: u32)` + // to the next token tree. + look!(p, 3, token::OpenDelim(Delimiter::Brace)); + // FIXME(nnethercote) The current code incorrectly skips the `x }` + // to the next token tree. + look!(p, 4, token::Ident(kw::Struct, raw_no)); + look!(p, 5, token::Ident(sym_S, raw_no)); + look!(p, 6, token::Semi); + // FIXME(nnethercote) If we lookahead any distance past a close delim + // we currently return that close delim. + look!(p, 7, token::CloseDelim(Delimiter::Brace)); + look!(p, 8, token::CloseDelim(Delimiter::Brace)); + look!(p, 9, token::CloseDelim(Delimiter::Brace)); + look!(p, 100, token::CloseDelim(Delimiter::Brace)); + }); +} + // This tests that when parsing a string (rather than a file) we don't try // and read in a file for a module declaration and just parse a stub. // See `recurse_into_file_modules` in the parser. From ebe1305b1e0bb32913b309ce65bd97106532ad6a Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Fri, 12 Jul 2024 12:56:58 +1000 Subject: [PATCH 2/3] Remove the bogus special case from `Parser::look_ahead`. The general case at the bottom of `look_ahead` is slow, because it clones the token cursor. Above it there is a special case for performance that is hit most of the time and avoids the cloning. Unfortunately, its behaviour differs from the general case in two ways. - When within a pair of delimiters, if you look any distance past the closing delimiter you get the closing delimiter instead of what comes after the closing delimiter. - It uses `tree_cursor.look_ahead(dist - 1)` which totally confuses tokens with token trees. This means that only the first token in a token tree will be seen. E.g. in a sequence like `{ a }` the `a` and `}` will be skipped over. Bad! It's likely that these differences weren't noticed before now because the use of `look_ahead` in the parser is limited to small distances and relatively few contexts. Removing the special case causes slowdowns up of to 2% on a range of benchmarks. The next commit will add a new, correct special case to regain that lost performance. --- compiler/rustc_parse/src/parser/mod.rs | 37 +------------------ compiler/rustc_parse/src/parser/tests.rs | 47 +++++++++++++----------- 2 files changed, 28 insertions(+), 56 deletions(-) diff --git a/compiler/rustc_parse/src/parser/mod.rs b/compiler/rustc_parse/src/parser/mod.rs index 45ca267fe5d1e..f906a2ecab7a8 100644 --- a/compiler/rustc_parse/src/parser/mod.rs +++ b/compiler/rustc_parse/src/parser/mod.rs @@ -1118,41 +1118,8 @@ impl<'a> Parser<'a> { return looker(&self.token); } - if let Some(&(_, span, _, delim)) = self.token_cursor.stack.last() - && delim != Delimiter::Invisible - { - // We are not in the outermost token stream, and the token stream - // we are in has non-skipped delimiters. Look for skipped - // delimiters in the lookahead range. - let tree_cursor = &self.token_cursor.tree_cursor; - let all_normal = (0..dist).all(|i| { - let token = tree_cursor.look_ahead(i); - !matches!(token, Some(TokenTree::Delimited(.., Delimiter::Invisible, _))) - }); - if all_normal { - // There were no skipped delimiters. Do lookahead by plain indexing. - return match tree_cursor.look_ahead(dist - 1) { - Some(tree) => { - // Indexing stayed within the current token stream. - match tree { - TokenTree::Token(token, _) => looker(token), - TokenTree::Delimited(dspan, _, delim, _) => { - looker(&Token::new(token::OpenDelim(*delim), dspan.open)) - } - } - } - None => { - // Indexing went past the end of the current token - // stream. Use the close delimiter, no matter how far - // ahead `dist` went. - looker(&Token::new(token::CloseDelim(delim), span.close)) - } - }; - } - } - - // We are in a more complex case. Just clone the token cursor and use - // `next`, skipping delimiters as necessary. Slow but simple. + // Just clone the token cursor and use `next`, skipping delimiters as + // necessary. Slow but simple. let mut cursor = self.token_cursor.clone(); let mut i = 0; let mut token = Token::dummy(); diff --git a/compiler/rustc_parse/src/parser/tests.rs b/compiler/rustc_parse/src/parser/tests.rs index 538ec6b8cfd45..5b2d119e42b45 100644 --- a/compiler/rustc_parse/src/parser/tests.rs +++ b/compiler/rustc_parse/src/parser/tests.rs @@ -1424,12 +1424,15 @@ fn look_ahead() { look!(p, 1, token::Colon); look!(p, 2, token::Ident(sym::u32, raw_no)); look!(p, 3, token::CloseDelim(Delimiter::Parenthesis)); - // FIXME(nnethercote) If we lookahead any distance past a close delim - // we currently return that close delim. - look!(p, 4, token::CloseDelim(Delimiter::Parenthesis)); - look!(p, 5, token::CloseDelim(Delimiter::Parenthesis)); - look!(p, 6, token::CloseDelim(Delimiter::Parenthesis)); - look!(p, 100, token::CloseDelim(Delimiter::Parenthesis)); + look!(p, 4, token::OpenDelim(Delimiter::Brace)); + look!(p, 5, token::Ident(sym_x, raw_no)); + look!(p, 6, token::CloseDelim(Delimiter::Brace)); + look!(p, 7, token::Ident(kw::Struct, raw_no)); + look!(p, 8, token::Ident(sym_S, raw_no)); + look!(p, 9, token::Semi); + look!(p, 10, token::Eof); + look!(p, 11, token::Eof); + look!(p, 100, token::Eof); // Move forward to the `;`. for _ in 0..9 { @@ -1454,12 +1457,13 @@ fn look_ahead() { }); } -/// FIXME(nnethercote) Currently there is some buggy behaviour when using -/// `look_ahead` not within the outermost token stream, as this test shows. +/// There used to be some buggy behaviour when using `look_ahead` not within +/// the outermost token stream, which this test covers. #[test] fn look_ahead_non_outermost_stream() { create_default_session_globals_then(|| { let sym_f = Symbol::intern("f"); + let sym_x = Symbol::intern("x"); #[allow(non_snake_case)] let sym_S = Symbol::intern("S"); let raw_no = IdentIsRaw::No; @@ -1475,20 +1479,21 @@ fn look_ahead_non_outermost_stream() { look!(p, 0, token::Ident(kw::Fn, raw_no)); look!(p, 1, token::Ident(sym_f, raw_no)); look!(p, 2, token::OpenDelim(Delimiter::Parenthesis)); - // FIXME(nnethercote) The current code incorrectly skips the `x: u32)` - // to the next token tree. - look!(p, 3, token::OpenDelim(Delimiter::Brace)); - // FIXME(nnethercote) The current code incorrectly skips the `x }` - // to the next token tree. - look!(p, 4, token::Ident(kw::Struct, raw_no)); - look!(p, 5, token::Ident(sym_S, raw_no)); - look!(p, 6, token::Semi); - // FIXME(nnethercote) If we lookahead any distance past a close delim - // we currently return that close delim. - look!(p, 7, token::CloseDelim(Delimiter::Brace)); - look!(p, 8, token::CloseDelim(Delimiter::Brace)); + look!(p, 3, token::Ident(sym_x, raw_no)); + look!(p, 4, token::Colon); + look!(p, 5, token::Ident(sym::u32, raw_no)); + look!(p, 6, token::CloseDelim(Delimiter::Parenthesis)); + look!(p, 7, token::OpenDelim(Delimiter::Brace)); + look!(p, 8, token::Ident(sym_x, raw_no)); look!(p, 9, token::CloseDelim(Delimiter::Brace)); - look!(p, 100, token::CloseDelim(Delimiter::Brace)); + look!(p, 10, token::Ident(kw::Struct, raw_no)); + look!(p, 11, token::Ident(sym_S, raw_no)); + look!(p, 12, token::Semi); + look!(p, 13, token::CloseDelim(Delimiter::Brace)); + // Any lookahead past the end of the token stream returns `Eof`. + look!(p, 14, token::Eof); + look!(p, 15, token::Eof); + look!(p, 100, token::Eof); }); } From 100f3fd133d928e9c86cff202fa32e5e4d0ce6c7 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Fri, 12 Jul 2024 13:20:24 +1000 Subject: [PATCH 3/3] Add a new special case to `Parser::look_ahead`. This new special case is simpler than the old special case because it only is used when `dist == 1`. But that's still enough to cover ~98% of cases. This results in equivalent performance to the old special case, and identical behaviour as the general case. --- compiler/rustc_parse/src/parser/mod.rs | 29 ++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/compiler/rustc_parse/src/parser/mod.rs b/compiler/rustc_parse/src/parser/mod.rs index f906a2ecab7a8..ef9b3aabc61ca 100644 --- a/compiler/rustc_parse/src/parser/mod.rs +++ b/compiler/rustc_parse/src/parser/mod.rs @@ -1118,6 +1118,35 @@ impl<'a> Parser<'a> { return looker(&self.token); } + // Typically around 98% of the `dist > 0` cases have `dist == 1`, so we + // have a fast special case for that. + if dist == 1 { + // The index is zero because the tree cursor's index always points + // to the next token to be gotten. + match self.token_cursor.tree_cursor.look_ahead(0) { + Some(tree) => { + // Indexing stayed within the current token tree. + return match tree { + TokenTree::Token(token, _) => looker(token), + TokenTree::Delimited(dspan, _, delim, _) => { + looker(&Token::new(token::OpenDelim(*delim), dspan.open)) + } + }; + } + None => { + // The tree cursor lookahead went (one) past the end of the + // current token tree. Try to return a close delimiter. + if let Some(&(_, span, _, delim)) = self.token_cursor.stack.last() + && delim != Delimiter::Invisible + { + // We are not in the outermost token stream, so we have + // delimiters. Also, those delimiters are not skipped. + return looker(&Token::new(token::CloseDelim(delim), span.close)); + } + } + } + } + // Just clone the token cursor and use `next`, skipping delimiters as // necessary. Slow but simple. let mut cursor = self.token_cursor.clone();