Add support for match statements to parser

Charlie Marsh 2023-02-17 22:09:18 -05:00
parent 4bdc2d47c1
commit 2b43d45bd5
8 changed files with 10815 additions and 4 deletions


@@ -131,4 +131,5 @@ mod string;
#[rustfmt::skip]
mod python;
mod context;
+mod soft_keywords;
pub mod token;


@@ -18,6 +18,7 @@ use crate::{ast, error::ParseError, lexer, python};
use ast::Location;
use itertools::Itertools;
use std::iter;
+use crate::soft_keywords::soft_keywords;

/// Parse a full Python program usually consisting of multiple lines.
///
@@ -188,9 +189,8 @@ pub fn parse_tokens(
    let tokenizer = iter::once(Ok(marker_token))
        .chain(lxr)
        .filter_ok(|(_, tok, _)| !matches!(tok, Tok::Comment { .. } | Tok::NonLogicalNewline));

    python::TopParser::new()
-       .parse(tokenizer)
+       .parse(soft_keywords(tokenizer, mode).into_iter())
        .map_err(|e| crate::error::parse_error_from_lalrpop(e, source_path))
}
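With the soft-keyword pass wired in ahead of the LALRPOP grammar, a top-level `match` statement now parses end to end. A minimal sketch of the new behavior, assuming the crate's existing `parse_program` entry point (illustrative; not part of the commit):

    #[test]
    fn match_statement_parses() {
        // The line starts with `match` and contains a top-level colon, so the
        // soft-keyword pass leaves `match` and `case` as keyword tokens.
        let source = "match x:\n    case 1:\n        pass\n";
        let ast = parse_program(source, "<embedded>").unwrap();
        assert_eq!(ast.len(), 1);
    }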
@@ -419,4 +419,205 @@ with (0 as a, 1 as b,): pass
        assert!(parse(&source, Mode::Module, "<embedded>").is_ok());
        assert!(parse(&source, Mode::Interactive, "<embedded>").is_ok());
    }

    #[test]
    fn test_match_as_identifier() {
        let parse_ast = parse_program(
            r#"
match *a + b, c  # ((match * a) + b), c
match *(a + b), c  # (match * (a + b)), c
match (*a + b, c)  # match ((*(a + b)), c)
match -a * b + c  # (match - (a * b)) + c
match -(a * b) + c  # (match - (a * b)) + c
match (-a) * b + c  # (match (-(a * b))) + c
match ().a  # (match()).a
match (()).a  # (match(())).a
match ((),).a  # (match(())).a
match [a].b  # (match[a]).b
match [a,].b  # (match[(a,)]).b (not (match[a]).b)
match [(a,)].b  # (match[(a,)]).b
match()[a:
    b]  # (match())[a: b]
if match := 1: pass
match match:
    case 1: pass
    case 2:
        pass
"#,
            "<test>",
        )
        .unwrap();
        insta::assert_debug_snapshot!(parse_ast);
    }

    #[test]
    fn test_match_complex() {
        let source = r#"# Cases sampled from Lib/test/test_patma.py

# case test_patma_098
match x:
    case -0j:
        y = 0
# case test_patma_142
match x:
    case bytes(z):
        y = 0
# case test_patma_073
match x:
    case 0 if 0:
        y = 0
    case 0 if 1:
        y = 1
# case test_patma_006
match 3:
    case 0 | 1 | 2 | 3:
        x = True
# case test_patma_049
match x:
    case [0, 1] | [1, 0]:
        y = 0
# case black_check_sequence_then_mapping
match x:
    case [*_]:
        return "seq"
    case {}:
        return "map"
# case test_patma_035
match x:
    case {0: [1, 2, {}]}:
        y = 0
    case {0: [1, 2, {}] | True} | {1: [[]]} | {0: [1, 2, {}]} | [] | "X" | {}:
        y = 1
    case []:
        y = 2
# case test_patma_107
match x:
    case 0.25 + 1.75j:
        y = 0
# case test_patma_097
match x:
    case -0j:
        y = 0
# case test_patma_007
match 4:
    case 0 | 1 | 2 | 3:
        x = True
# case test_patma_154
match x:
    case 0 if x:
        y = 0
# case test_patma_134
match x:
    case {1: 0}:
        y = 0
    case {0: 0}:
        y = 1
    case {**z}:
        y = 2
# case test_patma_185
match Seq():
    case [*_]:
        y = 0
# case test_patma_063
match x:
    case 1:
        y = 0
    case 1:
        y = 1
# case test_patma_248
match x:
    case {"foo": bar}:
        y = bar
# case test_patma_019
match (0, 1, 2):
    case [0, 1, *x, 2]:
        y = 0
# case test_patma_052
match x:
    case [0]:
        y = 0
    case [1, 0] if (x := x[:0]):
        y = 1
    case [1, 0]:
        y = 2
# case test_patma_191
match w:
    case [x, y, *_]:
        z = 0
# case test_patma_110
match x:
    case -0.25 - 1.75j:
        y = 0
# case test_patma_151
match (x,):
    case [y]:
        z = 0
# case test_patma_114
match x:
    case A.B.C.D:
        y = 0
# case test_patma_232
match x:
    case None:
        y = 0
# case test_patma_058
match x:
    case 0:
        y = 0
# case test_patma_233
match x:
    case False:
        y = 0
# case test_patma_078
match x:
    case []:
        y = 0
    case [""]:
        y = 1
    case "":
        y = 2
# case test_patma_156
match x:
    case z:
        y = 0
# case test_patma_189
match w:
    case [x, y, *rest]:
        z = 0
# case test_patma_042
match x:
    case (0 as z) | (1 as z) | (2 as z) if z == x % 2:
        y = 0
# case test_patma_034
match x:
    case {0: [1, 2, {}]}:
        y = 0
    case {0: [1, 2, {}] | False} | {1: [[]]} | {0: [1, 2, {}]} | [] | "X" | {}:
        y = 1
    case []:
        y = 2
# case test_patma_123
match (0, 1, 2):
    case 0, *x:
        y = 0
# case test_patma_126
match (0, 1, 2):
    case *x, 2,:
        y = 0
# case test_patma_151
match x,:
    case y,:
        z = 0
# case test_patma_152
match w, x:
    case y, z:
        v = 0
# case test_patma_153
match w := x,:
    case y as v,:
        z = 0
"#;
        let parse_ast = parse_program(source, "<test>").unwrap();
        insta::assert_debug_snapshot!(parse_ast);
    }
}

File diff suppressed because it is too large.

File diff suppressed because it is too large.

parser/src/soft_keywords.rs (new file, 117 lines)

@@ -0,0 +1,117 @@
use crate::lexer::{LexResult, Tok};
pub use crate::mode::Mode;
/// Collect all tokens from a token stream in a vector.
fn collect_tokens(tokenizer: impl IntoIterator<Item = LexResult>) -> Vec<LexResult> {
    let mut tokens: Vec<LexResult> = vec![];
    for tok in tokenizer {
        let is_err = tok.is_err();
        tokens.push(tok);
        if is_err {
            break;
        }
    }
    tokens
}
/// Modify a token stream to accommodate soft keywords (namely, `match` and `case`).
///
/// [PEP 634](https://www.python.org/dev/peps/pep-0634/) introduced the `match` and `case` keywords
/// as soft keywords, meaning that they can be used as identifiers (e.g., variable names) in certain
/// contexts.
///
/// This function modifies a token stream to accommodate this change. In particular, it replaces
/// `match` and `case` tokens with `identifier` tokens if they are used as identifiers.
///
/// Handling soft keywords in this intermediate pass allows us to simplify both the lexer and
/// parser, as neither of them needs to be aware of soft keywords.
pub fn soft_keywords(
    tokenizer: impl IntoIterator<Item = LexResult>,
    mode: Mode,
) -> Vec<LexResult> {
    let mut tokenizer: Vec<LexResult> = collect_tokens(tokenizer);
    let mut start_of_line = matches!(mode, Mode::Module | Mode::Interactive);
    for i in 0..tokenizer.len() {
        // If the token is a `match` or `case` token, check if it's used as an identifier.
        // We assume every `match` or `case` is an identifier unless all of the following
        // conditions are met:
        // 1. The token is at the start of a logical line.
        // 2. The logical line contains a top-level colon (that is, a colon that is not nested
        //    inside a parenthesized expression, list, or dictionary).
        // 3. The top-level colon is not the immediate sibling of a `match` or `case` token.
        //    (This is to avoid treating `match` and `case` as identifiers when annotated with
        //    type hints.)
        if tokenizer[i]
            .as_ref()
            .map_or(false, |(_, tok, _)| matches!(tok, Tok::Match | Tok::Case))
        {
            let is_identifier = {
                if !start_of_line {
                    // If the `match` or `case` token is not at the start of a line, it's
                    // definitely an identifier.
                    true
                } else {
                    // Otherwise, scan the rest of the logical line for a top-level colon
                    // that is not the token's immediate sibling.
                    let mut seen_colon = false;
                    let mut first = true;
                    let mut par_count = 0;
                    let mut sqb_count = 0;
                    let mut brace_count = 0;
                    for (_, tok, _) in tokenizer.iter().skip(i + 1).flatten() {
                        match tok {
                            Tok::Newline => break,
                            Tok::Colon if par_count == 0 && sqb_count == 0 && brace_count == 0 => {
                                if !first {
                                    seen_colon = true;
                                }
                                break;
                            }
                            Tok::Lpar => {
                                par_count += 1;
                            }
                            Tok::Rpar => {
                                par_count -= 1;
                            }
                            Tok::Lsqb => {
                                sqb_count += 1;
                            }
                            Tok::Rsqb => {
                                sqb_count -= 1;
                            }
                            Tok::Lbrace => {
                                brace_count += 1;
                            }
                            Tok::Rbrace => {
                                brace_count -= 1;
                            }
                            _ => {}
                        }
                        first = false;
                    }
                    !seen_colon
                }
            };
            if is_identifier {
                if let Ok((_, tok, _)) = &mut tokenizer[i] {
                    if let Tok::Match = tok {
                        *tok = Tok::Name {
                            name: "match".to_string(),
                        };
                    } else if let Tok::Case = tok {
                        *tok = Tok::Name {
                            name: "case".to_string(),
                        };
                    }
                }
            }
        }
        start_of_line = tokenizer[i].as_ref().map_or(false, |(_, tok, _)| {
            matches!(
                tok,
                Tok::StartModule | Tok::StartInteractive | Tok::Newline | Tok::Indent | Tok::Dedent
            )
        });
    }
    tokenizer
}
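To make the rewrite concrete, here is a hedged sketch of the pass applied to two token streams. It assumes the crate's `lexer::make_tokenizer` constructor, which does not appear in this diff; the assertions are illustrative, not part of the commit:

use crate::lexer::make_tokenizer;

#[test]
fn soft_keyword_rewrite_examples() {
    // `match x:` starts a logical line and ends in a top-level colon that is
    // not the immediate sibling of `match`, so the keyword token survives.
    let kept = soft_keywords(make_tokenizer("match x:\n    case 1: pass\n"), Mode::Module);
    assert!(matches!(kept[0], Ok((_, Tok::Match, _))));

    // In `match: int = 1` the colon immediately follows `match` (an annotated
    // assignment), so the token is rewritten to a plain identifier.
    let rewritten = soft_keywords(make_tokenizer("match: int = 1\n"), Mode::Module);
    assert!(matches!(&rewritten[0], Ok((_, Tok::Name { name }, _)) if name == "match"));
}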


@@ -184,6 +184,8 @@ pub enum Tok {
    Return,
    Try,
    While,
+    Match,
+    Case,
    With,
    Yield,
@@ -297,6 +299,8 @@ impl fmt::Display for Tok {
            Return => f.write_str("'return'"),
            Try => f.write_str("'try'"),
            While => f.write_str("'while'"),
+            Match => f.write_str("'match'"),
+            Case => f.write_str("'case'"),
            With => f.write_str("'with'"),
            Yield => f.write_str("'yield'"),
            ColonEqual => f.write_str("':='"),
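A small usage note on the new Display arms: they let the parser name the `match` and `case` tokens in diagnostics, quoted like the other keywords (illustrative, based on the impl above):

assert_eq!(Tok::Match.to_string(), "'match'");
assert_eq!(Tok::Case.to_string(), "'case'");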