Replace LALRPOP parser with hand-written parser (#10036)

(Supersedes #9152, authored by @LaBatata101) ## Summary This PR replaces the current LALRPOP-generated parser with a hand-written recursive descent parser. It also updates the grammar for [PEP 646](https://peps.python.org/pep-0646/) so that the parser outputs the correct AST. For example, in `data[*x]`, the index expression is now a tuple with a single starred expression instead of just a starred expression. Beyond the performance improvements, the parser is also error-resilient and can provide better error messages. The behavior as seen by any downstream tools isn't changed. That is, the linter and formatter can still assume that the parser will _stop_ at the first syntax error. This will be updated in the following months. For more details about the change here, refer to the PRs corresponding to the individual commits and the release blog post. ## Test Plan Write _lots_ and _lots_ of tests for both valid and invalid syntax and verify the output. ## Acknowledgements - @MichaReiser for reviewing 100+ parser PRs and continuously providing guidance throughout the project - @LaBatata101 for initiating the transition to a hand-written parser in #9152 - @addisoncrump for implementing the fuzzer which helped [catch](https://github.com/astral-sh/ruff/pull/10903) [a](https://github.com/astral-sh/ruff/pull/10910) [lot](https://github.com/astral-sh/ruff/pull/10966) [of](https://github.com/astral-sh/ruff/pull/10896) [bugs](https://github.com/astral-sh/ruff/pull/10877) --------- Co-authored-by: Victor Hugo Gomes <labatata101@linuxmail.org> Co-authored-by: Micha Reiser <micha@reiser.io>
2025-07-16 01:25:22 +00:00 · 2024-04-18 17:57:39 +05:30 · 2024-04-18 17:57:39 +05:30 · 13ffb5bc19
commit 13ffb5bc19
parent e09180b1df
852 changed files with 112948 additions and 103620 deletions
--- a/crates/ruff_python_parser/resources/invalid/statements/function_type_parameters.py
+++ b/crates/ruff_python_parser/resources/invalid/statements/function_type_parameters.py
@ -0,0 +1,19 @@
+# FIXME: The type param related error message and the parser recovery are looking pretty good **except**
+# that the lexer never recovers from the unclosed `[`, resulting in it lexing `NonLogicalNewline` tokens instead of `Newline` tokens.
+# That's because the parser has no way of feeding the error recovery back to the lexer,
+# so they don't agree on the state of the world which can lead to all kinds of errors further down in the file.
+# This is not just a problem with parentheses but also with the transformation made by the
+# `SoftKeywordTransformer` because the `Parser` and `Transformer` may not agree on whether they're
+# currently in a position where the `type` keyword is allowed or not.
+# That roughly means that any kind of recovery can lead to unrelated syntax errors
+# on following lines.
+
+def keyword[A, await](): ...
+
+def not_a_type_param[A, |, B](): ...
+
+def multiple_commas[A,,B](): ...
+
+def multiple_trailing_commas[A,,](): ...
+
+def multiple_commas_and_recovery[A,,100](): ...
--- a/crates/ruff_python_parser/resources/invalid/statements/if_extra_closing_parentheses.py
+++ b/crates/ruff_python_parser/resources/invalid/statements/if_extra_closing_parentheses.py
@ -0,0 +1,3 @@
+# FIXME(micha): This creates two syntax errors instead of just one (and overlapping ones)
+if True)):
+    pass
--- a/crates/ruff_python_parser/resources/invalid/statements/if_extra_indent.py
+++ b/crates/ruff_python_parser/resources/invalid/statements/if_extra_indent.py
@ -0,0 +1,8 @@
+# Improving the recovery would require changing the lexer to emit an extra dedent token after `a + b`.
+if True:
+    pass
+        a + b
+
+    pass
+
+a = 10
--- a/crates/ruff_python_parser/resources/invalid/statements/invalid_assignment_targets.py
+++ b/crates/ruff_python_parser/resources/invalid/statements/invalid_assignment_targets.py
@ -0,0 +1,42 @@
+# Regression test: https://github.com/astral-sh/ruff/issues/6895
+# First we test, broadly, that various kinds of assignments are now
+# rejected by the parser. e.g., `5 = 3`, `5 += 3`, `(5): int = 3`.
+
+5 = 3
+
+5 += 3
+
+(5): int = 3
+
+# Now we exhaustively test all possible cases where assignment can fail.
+x or y = 42
+(x := 5) = 42
+x + y = 42
+-x = 42
+(lambda _: 1) = 42
+a if b else c = 42
+{"a": 5} = 42
+{a} = 42
+[x for x in xs] = 42
+{x for x in xs} = 42
+{x: x * 2 for x in xs} = 42
+(x for x in xs) = 42
+await x = 42
+(yield x) = 42
+(yield from xs) = 42
+a < b < c = 42
+foo() = 42
+
+f"{quux}" = 42
+f"{foo} and {bar}" = 42
+
+"foo" = 42
+b"foo" = 42
+123 = 42
+True = 42
+None = 42
+... = 42
+*foo() = 42
+[x, foo(), y] = [42, 42, 42]
+[[a, b], [[42]], d] = [[1, 2], [[3]], 4]
+(x, foo(), y) = (42, 42, 42)
--- a/crates/ruff_python_parser/resources/invalid/statements/invalid_augmented_assignment_target.py
+++ b/crates/ruff_python_parser/resources/invalid/statements/invalid_augmented_assignment_target.py
@ -0,0 +1,34 @@
+# This is similar to `./invalid_assignment_targets.py`, but for augmented
+# assignment targets.
+
+x or y += 42
+(x := 5) += 42
+x + y += 42
+-x += 42
+(lambda _: 1) += 42
+a if b else c += 42
+{"a": 5} += 42
+{a} += 42
+[x for x in xs] += 42
+{x for x in xs} += 42
+{x: x * 2 for x in xs} += 42
+(x for x in xs) += 42
+await x += 42
+(yield x) += 42
+(yield from xs) += 42
+a < b < c += 42
+foo() += 42
+
+f"{quux}" += 42
+f"{foo} and {bar}" += 42
+
+"foo" += 42
+b"foo" += 42
+123 += 42
+True += 42
+None += 42
+... += 42
+*foo() += 42
+[x, foo(), y] += [42, 42, 42]
+[[a, b], [[42]], d] += [[1, 2], [[3]], 4]
+(x, foo(), y) += (42, 42, 42)
--- a/crates/ruff_python_parser/resources/invalid/statements/match/as_pattern_0.py
+++ b/crates/ruff_python_parser/resources/invalid/statements/match/as_pattern_0.py
@ -0,0 +1,8 @@
+match subject:
+    #            Parser shouldn't confuse this as being a
+    #            class pattern
+    #            v
+    case (x as y)(a, b):
+    #     ^^^^^^
+    #    as-pattern
+        pass
--- a/crates/ruff_python_parser/resources/invalid/statements/match/as_pattern_1.py
+++ b/crates/ruff_python_parser/resources/invalid/statements/match/as_pattern_1.py
@ -0,0 +1,8 @@
+match subject:
+    #             Parser shouldn't confuse this as being a
+    #             complex literal pattern
+    #             v
+    case (x as y) + 1j:
+    #     ^^^^^^
+    #    as-pattern
+        pass
--- a/crates/ruff_python_parser/resources/invalid/statements/match/as_pattern_2.py
+++ b/crates/ruff_python_parser/resources/invalid/statements/match/as_pattern_2.py
@ -0,0 +1,5 @@
+match subject:
+    # This `as` pattern is unparenthesized so the parser never takes the path
+    # where it might be confused as a complex literal pattern.
+    case x as y + 1j:
+        pass
--- a/crates/ruff_python_parser/resources/invalid/statements/match/as_pattern_3.py
+++ b/crates/ruff_python_parser/resources/invalid/statements/match/as_pattern_3.py
@ -0,0 +1,5 @@
+match subject:
+    #     Not in the mapping start token set, so the list parsing bails
+    #     v
+    case {(x as y): 1}:
+        pass
--- a/crates/ruff_python_parser/resources/invalid/statements/match/as_pattern_4.py
+++ b/crates/ruff_python_parser/resources/invalid/statements/match/as_pattern_4.py
@ -0,0 +1,5 @@
+match subject:
+    # This `as` pattern is unparenthesized so the parser never takes the path
+    # where it might be confused as a mapping key pattern.
+    case {x as y: 1}:
+        pass
--- a/crates/ruff_python_parser/resources/invalid/statements/match/invalid_class_pattern.py
+++ b/crates/ruff_python_parser/resources/invalid/statements/match/invalid_class_pattern.py
@ -0,0 +1,17 @@
+# Invalid keyword pattern in class argument
+match subject:
+    case Foo(x as y = 1):
+        pass
+    case Foo(x | y = 1):
+        pass
+    case Foo([x, y] = 1):
+        pass
+    case Foo({False: 0} = 1):
+        pass
+    case Foo(1=1):
+        pass
+    case Foo(Bar()=1):
+        pass
+    # Positional pattern cannot follow keyword pattern
+    # case Foo(x, y=1, z):
+    #     pass
--- a/crates/ruff_python_parser/resources/invalid/statements/match/invalid_lhs_or_rhs_pattern.py
+++ b/crates/ruff_python_parser/resources/invalid/statements/match/invalid_lhs_or_rhs_pattern.py
@ -0,0 +1,41 @@
+match invalid_lhs_pattern:
+    case Foo() + 1j:
+        pass
+    case x + 2j:
+        pass
+    case _ + 3j:
+        pass
+    case (1 | 2) + 4j:
+        pass
+    case [1, 2] + 5j:
+        pass
+    case {True: 1} + 6j:
+        pass
+    case 1j + 2j:
+        pass
+    case -1j + 2j:
+        pass
+    case Foo(a as b) + 1j:
+        pass
+
+match invalid_rhs_pattern:
+    case 1 + Foo():
+        pass
+    case 2 + x:
+        pass
+    case 3 + _:
+        pass
+    case 4 + (1 | 2):
+        pass
+    case 5 + [1, 2]:
+        pass
+    case 6 + {True: 1}:
+        pass
+    case 1 + 2:
+        pass
+    case 1 + Foo(a as b):
+        pass
+
+match invalid_lhs_rhs_pattern:
+    case Foo() + Bar():
+        pass
--- a/crates/ruff_python_parser/resources/invalid/statements/match/invalid_mapping_pattern.py
+++ b/crates/ruff_python_parser/resources/invalid/statements/match/invalid_mapping_pattern.py
@ -0,0 +1,23 @@
+# Starred expression is not allowed as a mapping pattern key
+match subject:
+    case {*key}:
+        pass
+    case {*key: 1}:
+        pass
+    case {*key 1}:
+        pass
+    case {*key, None: 1}:
+        pass
+
+# Pattern cannot follow a double star pattern
+# Multiple double star patterns are not allowed
+match subject:
+    case {**rest, None: 1}:
+        pass
+    case {**rest1, **rest2, None: 1}:
+        pass
+    case {**rest1, None: 1, **rest2}:
+        pass
+
+match subject:
+    case {Foo(a as b): 1}: ...
--- a/crates/ruff_python_parser/resources/invalid/statements/match/star_pattern_usage.py
+++ b/crates/ruff_python_parser/resources/invalid/statements/match/star_pattern_usage.py
@ -0,0 +1,24 @@
+# Star pattern is only allowed inside a sequence pattern
+match subject:
+    case *_:
+        pass
+    case *_ as x:
+        pass
+    case *foo:
+        pass
+    case *foo | 1:
+        pass
+    case 1 | *foo:
+        pass
+    case Foo(*_):
+        pass
+    case Foo(x=*_):
+        pass
+    case {*_}:
+        pass
+    case {*_: 1}:
+        pass
+    case {None: *_}:
+        pass
+    case 1 + *_:
+        pass
--- a/crates/ruff_python_parser/resources/invalid/statements/match/unary_add_usage.py
+++ b/crates/ruff_python_parser/resources/invalid/statements/match/unary_add_usage.py
@ -0,0 +1,12 @@
+# Unary addition isn't allowed but we parse it for better error recovery.
+match subject:
+    case +1:
+        pass
+    case 1 | +2 | -3:
+        pass
+    case [1, +2, -3]:
+        pass
+    case Foo(x=+1, y=-2):
+        pass
+    case {True: +1, False: -2}:
+        pass
--- a/crates/ruff_python_parser/resources/invalid/statements/with/ambiguous_lpar_with_items.py
+++ b/crates/ruff_python_parser/resources/invalid/statements/with/ambiguous_lpar_with_items.py
@ -0,0 +1,32 @@
+# This file contains test cases where the with items have an ambiguous left parenthesis.
+# These cases should raise the correct syntax error and recover properly.
+
+with (item1, item2),: ...
+with (item1, item2), as f: ...
+with (item1, item2), item3,: ...
+with (*item): ...
+with (*item) as f: ...
+with (item := 10 as f): ...
+with (item1, item2 := 10 as f): ...
+with (x for x in range(10), item): ...
+with (item, x for x in range(10)): ...
+
+# Make sure the parser doesn't report the same error twice
+with ((*item)): ...
+
+with (*x for x in iter, item): ...
+with (item1, *x for x in iter, item2): ...
+with (x as f, *y): ...
+with (*x, y as f): ...
+with (x, yield y): ...
+with (x, yield y, z): ...
+with (x, yield from y): ...
+with (x as f, y) as f: ...
+with (x for x in iter as y): ...
+
+# The inner `(...)` is parsed as parenthesized expression
+with ((item as f)): ...
+
+with (item as f), x: ...
+with (item as f1) as f2: ...
+with (item1 as f, item2 := 0): ...
--- a/crates/ruff_python_parser/resources/invalid/statements/with/empty_with_items.py
+++ b/crates/ruff_python_parser/resources/invalid/statements/with/empty_with_items.py
@ -0,0 +1,6 @@
+# There are no with items present.
+# The parser should recover from this syntax error.
+
+with : ...
+
+x + y
--- a/crates/ruff_python_parser/resources/invalid/statements/with/unclosed_ambiguous_lpar.py
+++ b/crates/ruff_python_parser/resources/invalid/statements/with/unclosed_ambiguous_lpar.py
@ -0,0 +1,3 @@
+with (:
+
+x + y
--- a/crates/ruff_python_parser/resources/invalid/statements/with/unclosed_ambiguous_lpar_eof.py
+++ b/crates/ruff_python_parser/resources/invalid/statements/with/unclosed_ambiguous_lpar_eof.py
@ -0,0 +1 @@
+with (
--- a/crates/ruff_python_parser/resources/invalid/statements/with/unparenthesized_with_items.py
+++ b/crates/ruff_python_parser/resources/invalid/statements/with/unparenthesized_with_items.py
@ -0,0 +1,9 @@
+# For parenthesized with items test cases, refer to `./ambiguous_lpar_with_items.py`
+
+with item,: pass
+with item as x,: pass
+with *item: pass
+with *item as x: pass
+with *item1, item2 as f: pass
+with item1 as f, *item2: pass
+with item := 0 as f: pass