Mirror of https://github.com/RustPython/Parser.git (synced 2025-07-10 06:35:17 +00:00)
Use multipeek
This commit is contained in:

parent 8649bf6f8f
commit f1f31324d0

2 changed files with 94 additions and 85 deletions
@@ -14,11 +14,11 @@
 use crate::lexer::{LexResult, Tok};
 pub use crate::mode::Mode;
+use crate::soft_keywords::SoftKeywordTransformer;
 use crate::{ast, error::ParseError, lexer, python};
 use ast::Location;
 use itertools::Itertools;
 use std::iter;
-use crate::soft_keywords::soft_keywords;

 /// Parse a full Python program usually consisting of multiple lines.
 ///

@@ -190,7 +190,7 @@ pub fn parse_tokens(
         .chain(lxr)
         .filter_ok(|(_, tok, _)| !matches!(tok, Tok::Comment { .. } | Tok::NonLogicalNewline));
     python::TopParser::new()
-        .parse(soft_keywords(tokenizer, mode).into_iter())
+        .parse(SoftKeywordTransformer::new(tokenizer, mode).into_iter())
         .map_err(|e| crate::error::parse_error_from_lalrpop(e, source_path))
 }
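The second changed file is the `soft_keywords` module itself, shown next. At the call site above, the integration change is a single line: the parser now pulls tokens from a lazy adapter instead of a pre-collected `Vec<LexResult>`. A rough sketch of the resulting pipeline shape, where `lex` is a hypothetical stand-in for the lexer entry point rather than a name from this commit:

    // Hypothetical wiring: any Iterator<Item = LexResult> can be wrapped.
    let tokens = lex(source); // assumed lexer entry point, yields LexResults
    // The transformer is itself an Iterator<Item = LexResult>, so the
    // parser consumes it directly, one token at a time.
    let parser_input = SoftKeywordTransformer::new(tokens, Mode::Module);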
@@ -1,20 +1,10 @@
+use itertools::{Itertools, MultiPeek};
+
 use crate::lexer::{LexResult, Tok};
 pub use crate::mode::Mode;

-/// Collect all tokens from a token stream in a vector.
-fn collect_tokens(tokenizer: impl IntoIterator<Item = LexResult>) -> Vec<LexResult> {
-    let mut tokens: Vec<LexResult> = vec![];
-    for tok in tokenizer {
-        let is_err = tok.is_err();
-        tokens.push(tok);
-        if is_err {
-            break;
-        }
-    }
-    tokens
-}
-
-/// Modify a token stream to accommodate soft keywords (namely, `match` and `case`).
+/// An [`Iterator`] that transforms a token stream to accommodate soft keywords (namely, `match`
+/// and `case`).
 ///
 /// [PEP 634](https://www.python.org/dev/peps/pep-0634/) introduced the `match` and `case` keywords
 /// as soft keywords, meaning that they can be used as identifiers (e.g., variable names) in certain
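To make the ambiguity concrete: in `match point:` the word opens a logical line that contains a top-level colon, so it must be the keyword, while in `match = 1` (no colon on the line), `match(x, y)` (any colon would be nested inside parentheses), and `match: int = 1` (the colon immediately follows the word, i.e. an annotation) it remains an ordinary identifier. These are illustrative Python snippets, not examples from the commit; the three rules they exercise are spelled out in the comment inside `next()` below.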
@@ -25,93 +15,112 @@ fn collect_tokens(tokenizer: impl IntoIterator<Item = LexResult>) -> Vec<LexResult> {
 ///
 /// Handling soft keywords in this intermediary pass allows us to simplify both the lexer and
 /// parser, as neither of them need to be aware of soft keywords.
-pub fn soft_keywords(
-    tokenizer: impl IntoIterator<Item = LexResult>,
-    mode: Mode,
-) -> Vec<LexResult> {
-    let mut tokenizer: Vec<LexResult> = collect_tokens(tokenizer);
-    let mut start_of_line = matches!(mode, Mode::Module | Mode::Interactive);
-    for i in 0..tokenizer.len() {
-        // If the token is a `match` or `case` token, check if it's used as an identifier.
-        // We assume every `match` or `case` is an identifier unless both of the following
-        // conditions are met:
-        // 1. The token is at the start of a logical line.
-        // 2. The logical line contains a top-level colon (that is, a colon that is not nested
-        //    inside a parenthesized expression, list, or dictionary).
-        // 3. The top-level colon is not the immediate sibling of a `match` or `case` token.
-        //    (This is to avoid treating `match` and `case` as identifiers when annotated with
-        //    type hints.)
-        if tokenizer[i]
-            .as_ref()
-            .map_or(false, |(_, tok, _)| matches!(tok, Tok::Match | Tok::Case))
-        {
-            let is_identifier = {
-                if !start_of_line {
-                    // If the `match` or `case` token is not at the start of a line, it's definitely
-                    // an identifier.
-                    true
+pub struct SoftKeywordTransformer<I>
+where
+    I: Iterator<Item = LexResult>,
+{
+    pub underlying: MultiPeek<I>,
+    pub start_of_line: bool,
+}
+
+impl<I> SoftKeywordTransformer<I>
+where
+    I: Iterator<Item = LexResult>,
+{
+    pub fn new(tokenizer: I, mode: Mode) -> Self {
+        Self {
+            underlying: tokenizer.multipeek(),
+            start_of_line: matches!(mode, Mode::Interactive | Mode::Module),
+        }
+    }
+}
+
+impl<I> Iterator for SoftKeywordTransformer<I>
+where
+    I: Iterator<Item = LexResult>,
+{
+    type Item = LexResult;
+
+    #[inline]
+    fn next(&mut self) -> Option<LexResult> {
+        let mut next = self.underlying.next();
+        if let Some(Ok((start, tok, end))) = next.as_ref() {
+            // If the token is a `match` or `case` token, check if it's used as an identifier.
+            // We assume every `match` or `case` is an identifier unless both of the following
+            // conditions are met:
+            // 1. The token is at the start of a logical line.
+            // 2. The logical line contains a top-level colon (that is, a colon that is not nested
+            //    inside a parenthesized expression, list, or dictionary).
+            // 3. The top-level colon is not the immediate sibling of a `match` or `case` token.
+            //    (This is to avoid treating `match` and `case` as identifiers when annotated with
+            //    type hints.)
+            if matches!(tok, Tok::Match | Tok::Case) {
+                if !self.start_of_line {
+                    next = Some(Ok((
+                        *start,
+                        Tok::Name {
+                            name: if matches!(tok, Tok::Match) {
+                                "match".to_string()
+                            } else {
+                                "case".to_string()
+                            },
+                        },
+                        *end,
+                    )));
                 } else {
-                    //
-                    let mut seen_colon = false;
-                    let mut first = true;
                     let mut par_count = 0;
                     let mut sqb_count = 0;
                     let mut brace_count = 0;
-                    for (_, tok, _) in tokenizer.iter().skip(i + 1).flatten() {
+                    let mut first = true;
+                    let mut seen_colon = false;
+                    while let Some(Ok((_, tok, _))) = self.underlying.peek() {
                         match tok {
                             Tok::Newline => break,
                             Tok::Colon if par_count == 0 && sqb_count == 0 && brace_count == 0 => {
                                 if !first {
                                     seen_colon = true;
                                 }
-                                break;
-                            }
-                            Tok::Lpar => {
-                                par_count += 1;
-                            }
-                            Tok::Rpar => {
-                                par_count -= 1;
-                            }
-                            Tok::Lsqb => {
-                                sqb_count += 1;
-                            }
-                            Tok::Rsqb => {
-                                sqb_count -= 1;
-                            }
-                            Tok::Lbrace => {
-                                brace_count += 1;
-                            }
-                            Tok::Rbrace => {
-                                brace_count -= 1;
                             }
+                            Tok::Lpar => par_count += 1,
+                            Tok::Rpar => par_count -= 1,
+                            Tok::Lsqb => sqb_count += 1,
+                            Tok::Rsqb => sqb_count -= 1,
+                            Tok::Lbrace => brace_count += 1,
+                            Tok::Rbrace => brace_count -= 1,
                             _ => {}
                         }
                         first = false;
                     }
-                    !seen_colon
-                }
-            };
-            if is_identifier {
-                if let Ok((_, tok, _)) = &mut tokenizer[i] {
-                    if let Tok::Match = tok {
-                        *tok = Tok::Name {
-                            name: "match".to_string(),
-                        };
-                    } else if let Tok::Case = tok {
-                        *tok = Tok::Name {
-                            name: "case".to_string(),
-                        };
-                    }
+                    if !seen_colon {
+                        next = Some(Ok((
+                            *start,
+                            Tok::Name {
+                                name: if matches!(tok, Tok::Match) {
+                                    "match".to_string()
+                                } else {
+                                    "case".to_string()
+                                },
+                            },
+                            *end,
+                        )));
+                    }
                 }
             }
         }
-        start_of_line = tokenizer[i].as_ref().map_or(false, |(_, tok, _)| {
-            matches!(
-                tok,
-                Tok::StartModule | Tok::StartInteractive | Tok::Newline | Tok::Indent | Tok::Dedent
-            )
-        });
-    }
-    tokenizer
+        self.start_of_line = next.as_ref().map_or(false, |lex_result| {
+            lex_result.as_ref().map_or(false, |(_, tok, _)| {
+                matches!(
+                    tok,
+                    Tok::StartModule
+                        | Tok::StartInteractive
+                        | Tok::Newline
+                        | Tok::Indent
+                        | Tok::Dedent
+                )
+            })
+        });
+
+        next
+    }
 }
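The commit title refers to itertools' `MultiPeek`, which is what makes the lazy lookahead in `next()` work: each `peek()` advances a separate cursor without consuming anything, and that cursor resets on the following `next()`, so every token inspected by the `while let` scan is still handed to the parser afterwards. A standalone sketch of those semantics, using plain integers in place of tokens:

    use itertools::Itertools;

    let mut it = [10, 20, 30].into_iter().multipeek();
    assert_eq!(it.peek(), Some(&10)); // first peek sees the next element...
    assert_eq!(it.peek(), Some(&20)); // ...and each further peek looks one deeper
    assert_eq!(it.next(), Some(10));  // next() is unaffected by prior peeking
    assert_eq!(it.peek(), Some(&20)); // and it resets the peek cursor

Compared with the removed `collect_tokens`-based pass, which buffered the whole token stream up front and rewrote the `Vec` in place, the iterator version allocates no token buffer and stops its lookahead naturally at the first lexer error, since the `while let Some(Ok(..))` pattern only matches while the upcoming items are `Ok`.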