Implement re-lexing logic for better error recovery (#11845)

## Summary This PR implements the re-lexing logic in the parser. This logic is only applied when recovering from an error during list parsing. The logic is as follows: 1. During list parsing, if an unexpected token is encountered and it detects that an outer context can understand it and thus recover from it, it invokes the re-lexing logic in the lexer 2. This logic first checks if the lexer is in a parenthesized context and returns if it's not. Thus, the logic is a no-op if the lexer isn't in a parenthesized context 3. It then reduces the nesting level by 1. It shouldn't reset it to 0 because otherwise the recovery from nested list parsing will be incorrect 4. Then, it tries to find last newline character going backwards from the current position of the lexer. This avoids any whitespaces but if it encounters any character other than newline or whitespace, it aborts. 5. Now, if there's a newline character, then it needs to be re-lexed in a logical context which means that the lexer needs to emit it as a `Newline` token instead of `NonLogicalNewline`. 6. If the re-lexing gives a different token than the current one, the token source needs to update it's token collection to remove all the tokens which comes after the new current position. It turns out that the list parsing isn't that happy with the results so it requires some re-arranging such that the following two errors are raised correctly: 1. Expected comma 2. Recovery context error For (1), the following scenarios needs to be considered: * Missing comma between two elements * Half parsed element because the grammar doesn't allow it (for example, named expressions) For (2), the following scenarios needs to be considered: 1. If the parser is at a comma which means that there's a missing element otherwise the comma would've been consumed by the first `eat` call above. And, the parser doesn't take the re-lexing route on a comma token. 2. If it's the first element and the current token is not a comma which means that it's an invalid element. resolves: #11640 ## Test Plan - [x] Update existing test snapshots and validate them - [x] Add additional test cases specific to the re-lexing logic and validate the snapshots - [x] Run the fuzzer on 3000+ valid inputs - [x] Run the fuzzer on invalid inputs - [x] Run the parser on various open source projects - [x] Make sure the ecosystem changes are none
2025-10-02 06:42:02 +00:00 · 2024-06-17 12:17:00 +05:30 · 2024-06-17 12:17:00 +05:30 · 8499abfa7f
commit 8499abfa7f
parent 1f654ee729
43 changed files with 1593 additions and 212 deletions
--- a/crates/ruff_python_parser/src/parser/mod.rs
+++ b/crates/ruff_python_parser/src/parser/mod.rs
@ -473,11 +473,6 @@ impl<'src> Parser<'src> {
        loop {
            progress.assert_progressing(self);

-            // The end of file marker ends all lists.
-            if self.at(TokenKind::EndOfFile) {
-                break;
-            }
-
            if recovery_context_kind.is_list_element(self) {
                parse_element(self);
            } else if recovery_context_kind.is_list_terminator(self) {
@ -533,54 +528,96 @@ impl<'src> Parser<'src> {
            .recovery_context
            .union(RecoveryContext::from_kind(recovery_context_kind));

+        let mut first_element = true;
        let mut trailing_comma_range: Option<TextRange> = None;

        loop {
            progress.assert_progressing(self);

-            // The end of file marker ends all lists.
-            if self.at(TokenKind::EndOfFile) {
-                break;
-            }
-
            if recovery_context_kind.is_list_element(self) {
                parse_element(self);

+                // Only unset this when we've completely parsed a single element. This is mainly to
+                // raise the correct error in case the first element isn't valid and the current
+                // token isn't a comma. Without this knowledge, the parser would later expect a
+                // comma instead of raising the context error.
+                first_element = false;
+
                let maybe_comma_range = self.current_token_range();
                if self.eat(TokenKind::Comma) {
                    trailing_comma_range = Some(maybe_comma_range);
                    continue;
                }
                trailing_comma_range = None;
-
-                if recovery_context_kind.is_list_terminator(self) {
-                    break;
-                }
-
-                self.expect(TokenKind::Comma);
-            } else if recovery_context_kind.is_list_terminator(self) {
-                break;
-            } else {
-                // Not a recognised element. Add an error and either skip the token or break
-                // parsing the list if the token is recognised as an element or terminator of an
-                // enclosing list.
-                let error = recovery_context_kind.create_error(self);
-                self.add_error(error, self.current_token_range());
-
-                // Run the error recovery: This also handles the case when an element is missing
-                // between two commas: `a,,b`
-                if self.is_enclosing_list_element_or_terminator() {
-                    break;
-                }
-
-                if self.at(TokenKind::Comma) {
-                    trailing_comma_range = Some(self.current_token_range());
-                } else {
-                    trailing_comma_range = None;
-                }
-
-                self.bump_any();
            }
+
+            // test_ok comma_separated_regular_list_terminator
+            // # The first element is parsed by `parse_list_like_expression` and the comma after
+            // # the first element is expected by `parse_list_expression`
+            // [0]
+            // [0, 1]
+            // [0, 1,]
+            // [0, 1, 2]
+            // [0, 1, 2,]
+            if recovery_context_kind.is_regular_list_terminator(self) {
+                break;
+            }
+
+            // test_err comma_separated_missing_comma_between_elements
+            // # The comma between the first two elements is expected in `parse_list_expression`.
+            // [0, 1 2]
+            if recovery_context_kind.is_list_element(self) {
+                // This is a special case to expect a comma between two elements and should be
+                // checked before running the error recovery. This is because the error recovery
+                // will always run as the parser is currently at a list element.
+                self.expect(TokenKind::Comma);
+                continue;
+            }
+
+            // Run the error recovery: If the token is recognised as an element or terminator of an
+            // enclosing list, then we try to re-lex in the context of a logical line and break out
+            // of list parsing.
+            if self.is_enclosing_list_element_or_terminator() {
+                self.tokens.re_lex_logical_token();
+                break;
+            }
+
+            if first_element || self.at(TokenKind::Comma) {
+                // There are two conditions when we need to add the recovery context error:
+                //
+                // 1. If the parser is at a comma which means that there's a missing element
+                //    otherwise the comma would've been consumed by the first `eat` call above.
+                //    And, the parser doesn't take the re-lexing route on a comma token.
+                // 2. If it's the first element and the current token is not a comma which means
+                //    that it's an invalid element.
+
+                // test_err comma_separated_missing_element_between_commas
+                // [0, 1, , 2]
+
+                // test_err comma_separated_missing_first_element
+                // call(= 1)
+                self.add_error(
+                    recovery_context_kind.create_error(self),
+                    self.current_token_range(),
+                );
+
+                trailing_comma_range = if self.at(TokenKind::Comma) {
+                    Some(self.current_token_range())
+                } else {
+                    None
+                };
+            } else {
+                // Otherwise, there should've been a comma at this position. This could be because
+                // the element isn't consumed completely by `parse_element`.
+
+                // test_err comma_separated_missing_comma
+                // call(**x := 1)
+                self.expect(TokenKind::Comma);
+
+                trailing_comma_range = None;
+            }
+
+            self.bump_any();
        }

        if let Some(trailing_comma_range) = trailing_comma_range {
@ -885,13 +922,32 @@ impl RecoveryContextKind {
    }

    /// Returns `true` if the parser is at a token that terminates the list as per the context.
+    ///
+    /// This token could either end the list or is only present for better error recovery. Refer to
+    /// [`is_regular_list_terminator`] to only check against the former.
+    ///
+    /// [`is_regular_list_terminator`]: RecoveryContextKind::is_regular_list_terminator
    fn is_list_terminator(self, p: &Parser) -> bool {
        self.list_terminator_kind(p).is_some()
    }

+    /// Returns `true` if the parser is at a token that terminates the list as per the context but
+    /// the token isn't part of the error recovery set.
+    fn is_regular_list_terminator(self, p: &Parser) -> bool {
+        matches!(
+            self.list_terminator_kind(p),
+            Some(ListTerminatorKind::Regular)
+        )
+    }
+
    /// Checks the current token the parser is at and returns the list terminator kind if the token
    /// terminates the list as per the context.
    fn list_terminator_kind(self, p: &Parser) -> Option<ListTerminatorKind> {
+        // The end of file marker ends all lists.
+        if p.at(TokenKind::EndOfFile) {
+            return Some(ListTerminatorKind::Regular);
+        }
+
        match self {
            // The parser must consume all tokens until the end
            RecoveryContextKind::ModuleStatements => None,