Use multipeek

Charlie Marsh 2023-02-19 22:37:00 -05:00
parent 8649bf6f8f
commit f1f31324d0
2 changed files with 94 additions and 85 deletions

parser.rs

@@ -14,11 +14,11 @@
 use crate::lexer::{LexResult, Tok};
 pub use crate::mode::Mode;
+use crate::soft_keywords::SoftKeywordTransformer;
 use crate::{ast, error::ParseError, lexer, python};
 use ast::Location;
 use itertools::Itertools;
 use std::iter;
-use crate::soft_keywords::soft_keywords;
 
 /// Parse a full Python program usually consisting of multiple lines.
 ///
@@ -190,7 +190,7 @@ pub fn parse_tokens(
         .chain(lxr)
         .filter_ok(|(_, tok, _)| !matches!(tok, Tok::Comment { .. } | Tok::NonLogicalNewline));
 
     python::TopParser::new()
-        .parse(soft_keywords(tokenizer, mode).into_iter())
+        .parse(SoftKeywordTransformer::new(tokenizer, mode).into_iter())
         .map_err(|e| crate::error::parse_error_from_lalrpop(e, source_path))
 }
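
With this change, the parser consumes tokens lazily: the old `soft_keywords` pass collected the entire token stream into a `Vec` up front, while `SoftKeywordTransformer` rewrites tokens on the fly as the parser pulls them. The lookahead that forced the eager collection is now served by itertools' `MultiPeek`, where repeated `peek()` calls walk forward through buffered items without consuming anything. A minimal standalone sketch of that behavior (not part of this commit):

use itertools::Itertools;

fn main() {
    let mut iter = [1, 2, 3].into_iter().multipeek();

    // Each successive peek() advances an internal cursor, buffering
    // items from the underlying iterator without consuming them.
    assert_eq!(iter.peek(), Some(&1));
    assert_eq!(iter.peek(), Some(&2));

    // next() still yields the first element and resets the peek cursor.
    assert_eq!(iter.next(), Some(1));
    assert_eq!(iter.peek(), Some(&2));
}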

soft_keywords.rs

@@ -1,20 +1,10 @@
+use itertools::{Itertools, MultiPeek};
+
 use crate::lexer::{LexResult, Tok};
 pub use crate::mode::Mode;
 
-/// Collect all tokens from a token stream in a vector.
-fn collect_tokens(tokenizer: impl IntoIterator<Item = LexResult>) -> Vec<LexResult> {
-    let mut tokens: Vec<LexResult> = vec![];
-    for tok in tokenizer {
-        let is_err = tok.is_err();
-        tokens.push(tok);
-        if is_err {
-            break;
-        }
-    }
-    tokens
-}
-
-/// Modify a token stream to accommodate soft keywords (namely, `match` and `case`).
+/// An [`Iterator`] that transforms a token stream to accommodate soft keywords (namely, `match`
+/// and `case`).
 ///
 /// [PEP 634](https://www.python.org/dev/peps/pep-0634/) introduced the `match` and `case` keywords
 /// as soft keywords, meaning that they can be used as identifiers (e.g., variable names) in certain
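
The decision rule in the doc comment (a `match`/`case` at the start of a logical line, followed by a top-level colon that is not its immediate sibling, is a keyword; anything else is an identifier) can be distilled into a self-contained sketch. The `Token` enum and `is_match_statement` helper below are illustrative stand-ins for this crate's types, not its API:

/// Illustrative stand-in for the lexer's token type.
#[derive(Debug)]
enum Token {
    Ident, // any identifier-like token
    Colon,
    Lpar,
    Rpar,
    Newline,
}

/// Scan the rest of the logical line (the tokens after `match`) for a
/// colon outside all brackets, mirroring the transformer's peek loop.
/// Only parentheses are tracked here, for brevity.
fn is_match_statement(rest: &[Token]) -> bool {
    let mut par_count = 0;
    let mut first = true;
    for tok in rest {
        match tok {
            Token::Newline => break,
            // A colon that immediately follows `match` (first == true) is
            // an annotation (`match: int = 1`), not a match statement.
            Token::Colon if par_count == 0 => return !first,
            Token::Lpar => par_count += 1,
            Token::Rpar => par_count -= 1,
            _ => {}
        }
        first = false;
    }
    false
}

fn main() {
    // `match x:` -> keyword (top-level colon after a subject expression).
    assert!(is_match_statement(&[Token::Ident, Token::Colon]));
    // `match(x)` then newline -> identifier (a call, no top-level colon).
    assert!(!is_match_statement(&[Token::Lpar, Token::Ident, Token::Rpar, Token::Newline]));
    // `match: int` -> identifier (the colon is an immediate sibling).
    assert!(!is_match_statement(&[Token::Colon, Token::Ident]));
}
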
@@ -25,93 +15,112 @@ fn collect_tokens(tokenizer: impl IntoIterator<Item = LexResult>) -> Vec<LexResu
 ///
 /// Handling soft keywords in this intermediary pass allows us to simplify both the lexer and
 /// parser, as neither of them need to be aware of soft keywords.
-pub fn soft_keywords(
-    tokenizer: impl IntoIterator<Item = LexResult>,
-    mode: Mode,
-) -> Vec<LexResult> {
-    let mut tokenizer: Vec<LexResult> = collect_tokens(tokenizer);
-    let mut start_of_line = matches!(mode, Mode::Module | Mode::Interactive);
-    for i in 0..tokenizer.len() {
-        // If the token is a `match` or `case` token, check if it's used as an identifier.
-        // We assume every `match` or `case` is an identifier unless both of the following
-        // conditions are met:
-        // 1. The token is at the start of a logical line.
-        // 2. The logical line contains a top-level colon (that is, a colon that is not nested
-        //    inside a parenthesized expression, list, or dictionary).
-        // 3. The top-level colon is not the immediate sibling of a `match` or `case` token.
-        //    (This is to avoid treating `match` and `case` as identifiers when annotated with
-        //    type hints.)
-        if tokenizer[i]
-            .as_ref()
-            .map_or(false, |(_, tok, _)| matches!(tok, Tok::Match | Tok::Case))
-        {
-            let is_identifier = {
-                if !start_of_line {
-                    // If the `match` or `case` token is not at the start of a line, it's definitely
-                    // an identifier.
-                    true
+pub struct SoftKeywordTransformer<I>
+where
+    I: Iterator<Item = LexResult>,
+{
+    pub underlying: MultiPeek<I>,
+    pub start_of_line: bool,
+}
+
+impl<I> SoftKeywordTransformer<I>
+where
+    I: Iterator<Item = LexResult>,
+{
+    pub fn new(tokenizer: I, mode: Mode) -> Self {
+        Self {
+            underlying: tokenizer.multipeek(),
+            start_of_line: matches!(mode, Mode::Interactive | Mode::Module),
+        }
+    }
+}
+
+impl<I> Iterator for SoftKeywordTransformer<I>
+where
+    I: Iterator<Item = LexResult>,
+{
+    type Item = LexResult;
+
+    #[inline]
+    fn next(&mut self) -> Option<LexResult> {
+        let mut next = self.underlying.next();
+        if let Some(Ok((start, tok, end))) = next.as_ref() {
+            // If the token is a `match` or `case` token, check if it's used as an identifier.
+            // We assume every `match` or `case` is an identifier unless both of the following
+            // conditions are met:
+            // 1. The token is at the start of a logical line.
+            // 2. The logical line contains a top-level colon (that is, a colon that is not nested
+            //    inside a parenthesized expression, list, or dictionary).
+            // 3. The top-level colon is not the immediate sibling of a `match` or `case` token.
+            //    (This is to avoid treating `match` and `case` as identifiers when annotated with
+            //    type hints.)
+            if matches!(tok, Tok::Match | Tok::Case) {
+                if !self.start_of_line {
+                    next = Some(Ok((
+                        *start,
+                        Tok::Name {
+                            name: if matches!(tok, Tok::Match) {
+                                "match".to_string()
+                            } else {
+                                "case".to_string()
+                            },
+                        },
+                        *end,
+                    )));
                 } else {
-                    //
-                    let mut seen_colon = false;
-                    let mut first = true;
                     let mut par_count = 0;
                     let mut sqb_count = 0;
                     let mut brace_count = 0;
-                    for (_, tok, _) in tokenizer.iter().skip(i + 1).flatten() {
+                    let mut first = true;
+                    let mut seen_colon = false;
+                    while let Some(Ok((_, tok, _))) = self.underlying.peek() {
                         match tok {
                             Tok::Newline => break,
                             Tok::Colon if par_count == 0 && sqb_count == 0 && brace_count == 0 => {
                                 if !first {
                                     seen_colon = true;
                                 }
                                 break;
                             }
-                            Tok::Lpar => {
-                                par_count += 1;
-                            }
-                            Tok::Rpar => {
-                                par_count -= 1;
-                            }
-                            Tok::Lsqb => {
-                                sqb_count += 1;
-                            }
-                            Tok::Rsqb => {
-                                sqb_count -= 1;
-                            }
-                            Tok::Lbrace => {
-                                brace_count += 1;
-                            }
-                            Tok::Rbrace => {
-                                brace_count -= 1;
-                            }
+                            Tok::Lpar => par_count += 1,
+                            Tok::Rpar => par_count -= 1,
+                            Tok::Lsqb => sqb_count += 1,
+                            Tok::Rsqb => sqb_count -= 1,
+                            Tok::Lbrace => brace_count += 1,
+                            Tok::Rbrace => brace_count -= 1,
                             _ => {}
                         }
                         first = false;
                     }
-                    !seen_colon
-                }
-            };
-            if is_identifier {
-                if let Ok((_, tok, _)) = &mut tokenizer[i] {
-                    if let Tok::Match = tok {
-                        *tok = Tok::Name {
-                            name: "match".to_string(),
-                        };
-                    } else if let Tok::Case = tok {
-                        *tok = Tok::Name {
-                            name: "case".to_string(),
-                        };
+                    if !seen_colon {
+                        next = Some(Ok((
+                            *start,
+                            Tok::Name {
+                                name: if matches!(tok, Tok::Match) {
+                                    "match".to_string()
+                                } else {
+                                    "case".to_string()
+                                },
+                            },
+                            *end,
+                        )));
                     }
                 }
             }
         }
-        start_of_line = tokenizer[i].as_ref().map_or(false, |(_, tok, _)| {
-            matches!(
-                tok,
-                Tok::StartModule | Tok::StartInteractive | Tok::Newline | Tok::Indent | Tok::Dedent
-            )
-        });
-    }
-    tokenizer
+        self.start_of_line = next.as_ref().map_or(false, |lex_result| {
+            lex_result.as_ref().map_or(false, |(_, tok, _)| {
+                matches!(
+                    tok,
+                    Tok::StartModule
+                        | Tok::StartInteractive
+                        | Tok::Newline
+                        | Tok::Indent
+                        | Tok::Dedent
+                )
+            })
+        });
+        next
+    }
 }
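
Since the transformer is just an iterator adapter, wiring it up is the single constructor call shown in `parse_tokens` above. A hypothetical crate-internal sketch (assuming a lexer entry point along the lines of `lexer::make_tokenizer`, which this diff does not show):

use crate::lexer::make_tokenizer; // assumed entry point, not part of this diff
use crate::mode::Mode;
use crate::soft_keywords::SoftKeywordTransformer;

fn dump_tokens(source: &str) {
    // Wrap the raw token stream; `match`/`case` used as identifiers come
    // out as Tok::Name rather than Tok::Match/Tok::Case.
    let tokens = SoftKeywordTransformer::new(make_tokenizer(source), Mode::Module);
    for result in tokens {
        match result {
            Ok((start, tok, end)) => println!("{start:?}..{end:?} {tok:?}"),
            Err(err) => eprintln!("lex error: {err:?}"),
        }
    }
}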